From c63dbfdeb3f4849fd1ac7b2f5a3cd4d171933c8d Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 10 May 2022 17:51:27 -0400 Subject: [PATCH 01/58] Move Huffman tree functions to separate header. --- CMakeLists.txt | 1 + include/huffman.hpp | 21 +++ src/compressors.cpp | 352 +----------------------------------------- src/huffman.cpp | 362 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 385 insertions(+), 351 deletions(-) create mode 100644 include/huffman.hpp create mode 100644 src/huffman.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e8067bda25..21d3e50e21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,6 +201,7 @@ set( MGARD_LIBRARY_CPP src/compress.cpp src/compress_internal.cpp + src/huffman.cpp src/compressors.cpp src/format.cpp ) diff --git a/include/huffman.hpp b/include/huffman.hpp new file mode 100644 index 0000000000..67bd1bf2fd --- /dev/null +++ b/include/huffman.hpp @@ -0,0 +1,21 @@ +#ifndef HUFFMAN_HPP +#define HUFFMAN_HPP +//!\file +//!\brief Huffman trees for quantized multilevel coefficients. 
+ +namespace mgard { + +void huffman_encoding(long int *quantized_data, const std::size_t n, + unsigned char **out_data_hit, size_t *out_data_hit_size, + unsigned char **out_data_miss, size_t *out_data_miss_size, + unsigned char **out_tree, size_t *out_tree_size); + +void huffman_decoding(long int *quantized_data, + const std::size_t quantized_data_size, + unsigned char *out_data_hit, size_t out_data_hit_size, + unsigned char *out_data_miss, size_t out_data_miss_size, + unsigned char *out_tree, size_t out_tree_size); + +} // namespace mgard + +#endif diff --git a/src/compressors.cpp b/src/compressors.cpp index 915912f7f9..34c3401e14 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -7,13 +7,13 @@ #include #include #include -#include #include #include #include #include "format.hpp" +#include "huffman.hpp" #ifdef MGARD_TIMING #include @@ -26,251 +26,6 @@ namespace mgard { -const int nql = 32768 * 4; - -struct htree_node { - int q; - size_t cnt; - unsigned int code; - size_t len; - htree_node *left; - htree_node *right; -}; - -struct huffman_codec { - int q; - unsigned int code; - size_t len; -}; - -bool myfunction(htree_node i, htree_node j) { return (i.cnt < j.cnt); } - -htree_node *new_htree_node(int q, size_t cnt) { - htree_node *new_node = new htree_node; - new_node->q = q; - new_node->cnt = cnt; - new_node->code = 0; - new_node->len = 0; - new_node->left = 0; - new_node->right = 0; - - return new_node; -} - -struct LessThanByCnt { - bool operator()(const htree_node *lhs, const htree_node *rhs) const { - return lhs->cnt > rhs->cnt; - } -}; - -template -using my_priority_queue = - std::priority_queue, LessThanByCnt>; - -void build_codec(htree_node *root, unsigned int code, size_t len, - huffman_codec *codec) { - - root->len = len; - root->code = code; - - if (!root->left && !root->right) { - codec[root->q].q = root->q; - codec[root->q].code = code; - codec[root->q].len = len; - } - - if (root->left) { - build_codec(root->left, code << 1, len + 1, 
codec); - } - - if (root->right) { - build_codec(root->right, code << 1 | 0x1, len + 1, codec); - } -} - -my_priority_queue *build_tree(size_t *cnt) { - my_priority_queue *phtree; - phtree = new my_priority_queue; -#if 1 - for (int i = 0; i < nql; i++) { - if (cnt[i] != 0) { - htree_node *new_node = new_htree_node(i, cnt[i]); - phtree->push(new_node); - } - } - - while (phtree->size() > 1) { - htree_node *top_node1 = phtree->top(); - phtree->pop(); - htree_node *top_node2 = phtree->top(); - phtree->pop(); - - htree_node *new_node = new_htree_node(-1, top_node1->cnt + top_node2->cnt); - new_node->left = top_node1; - new_node->right = top_node2; - phtree->push(new_node); - } -#endif - return phtree; -} - -void free_htree_node(htree_node *node) { - if (node->left) { - free_htree_node(node->left); - node->left = 0; - } - - if (node->right) { - free_htree_node(node->right); - node->right = 0; - } - - delete node; -} - -void free_tree(my_priority_queue *phtree) { - if (phtree) { - free_htree_node(phtree->top()); - - phtree->pop(); - - delete phtree; - } -} - -// Note this function will change the quantized data. -size_t *build_ft(long int *quantized_data, const std::size_t n, - size_t &num_outliers) { - size_t *cnt = (size_t *)malloc(nql * sizeof(size_t)); - std::memset(cnt, 0, nql * sizeof(size_t)); - - for (std::size_t i = 0; i < n; i++) { - // Convert quantization level to positive so that counting freq can be - // easily done. Level 0 is reserved a out-of-range flag. 
- quantized_data[i] = quantized_data[i] + nql / 2; - if (quantized_data[i] > 0 && quantized_data[i] < nql) { - cnt[quantized_data[i]]++; - } else { - cnt[0]++; - } - } - - num_outliers = cnt[0]; - - return cnt; -} - -huffman_codec *build_huffman_codec(long int *quantized_data, size_t **ft, - const std::size_t n, size_t &num_outliers) { - size_t *cnt; - - cnt = build_ft(quantized_data, n, num_outliers); - *ft = cnt; - - my_priority_queue *phtree = build_tree(cnt); - - huffman_codec *codec = (huffman_codec *)malloc(sizeof(huffman_codec) * nql); - std::memset(codec, 0, sizeof(huffman_codec) * nql); - - build_codec(phtree->top(), 0, 0, codec); - - free_tree(phtree); - phtree = 0; - - return codec; -} - -void huffman_decoding(long int *quantized_data, - const std::size_t quantized_data_size, - unsigned char *out_data_hit, size_t out_data_hit_size, - unsigned char *out_data_miss, size_t out_data_miss_size, - unsigned char *out_tree, size_t out_tree_size) { - size_t *cft = (size_t *)out_tree; - int nonZeros = out_tree_size / (2 * sizeof(size_t)); - size_t *ft = (size_t *)malloc(nql * sizeof(size_t)); - - std::memset(ft, 0, nql * sizeof(size_t)); - - for (int j = 0; j < nonZeros; j++) { - ft[cft[2 * j]] = cft[2 * j + 1]; - } - - my_priority_queue *phtree = build_tree(ft); - - unsigned int *buf = (unsigned int *)out_data_hit; - - // The out_data_miss may not be aligned. Therefore, the code - // here makes a new buffer. 
- int *miss_buf = (int *)malloc(out_data_miss_size); - if (out_data_miss_size) { - std::memcpy(miss_buf, out_data_miss, out_data_miss_size); - } - - int *miss_bufp = miss_buf; - - size_t start_bit = 0; - unsigned int mask = 0x80000000; - - long int *q = quantized_data; - size_t i = 0; - size_t num_missed = 0; - while (q < (quantized_data + (quantized_data_size / sizeof(*q)))) { - htree_node *root = phtree->top(); - assert(root); - - size_t len = 0; - int offset = 0; - while (root->left) { - int flag = *(buf + start_bit / 32 + offset) & mask; - if (!flag) { - root = root->left; - } else { - root = root->right; - } - - len++; - - mask >>= 1; - if (!mask) { - mask = 0x80000000; - offset = 1; - } else { - // offset = 0; - } - } - - if (root->q != 0) { - *q = root->q - nql / 2; - - } else { - *q = *miss_buf - nql / 2; - - miss_buf++; - num_missed++; - } - - q++; - i++; - - start_bit += len; - } - - assert(start_bit == out_data_hit_size); - assert(sizeof(int) * num_missed == out_data_miss_size); - - // Avoid unused argument warning. If NDEBUG is defined, then the assert - // becomes empty and out_data_hit_size is unused. Tell the compiler that - // is OK and expected. 
- (void)out_data_hit_size; - - free(miss_bufp); - miss_bufp = 0; - free_tree(phtree); - phtree = 0; - free(ft); - ft = 0; -} - void decompress_memory_huffman(unsigned char *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen) { @@ -313,111 +68,6 @@ void decompress_memory_huffman(unsigned char *const src, free(huffman_encoding_p); } -void huffman_encoding(long int *quantized_data, const std::size_t n, - unsigned char **out_data_hit, size_t *out_data_hit_size, - unsigned char **out_data_miss, size_t *out_data_miss_size, - unsigned char **out_tree, size_t *out_tree_size) { - size_t num_miss = 0; - size_t *ft = 0; - - huffman_codec *codec = build_huffman_codec(quantized_data, &ft, n, num_miss); - - assert(n >= num_miss); - - /* For those miss points, we still need to maintain a flag (q = 0), - * and therefore we need to allocate space for n numbers. - */ - unsigned char *p_hit = (unsigned char *)malloc(n * sizeof(int)); - std::memset(p_hit, 0, n * sizeof(int)); - - int *p_miss = 0; - if (num_miss > 0) { - p_miss = (int *)malloc(num_miss * sizeof(int)); - std::memset(p_miss, 0, num_miss * sizeof(int)); - } - - *out_data_hit = p_hit; - *out_data_miss = (unsigned char *)p_miss; - *out_data_hit_size = 0; - *out_data_miss_size = 0; - - size_t start_bit = 0; - unsigned int *cur = (unsigned int *)p_hit; - size_t cnt_missed = 0; - for (std::size_t i = 0; i < n; i++) { - int q = quantized_data[i]; - unsigned int code; - size_t len; - - if (q > 0 && q < nql) { - // for those that are within the range - code = codec[q].code; - len = codec[q].len; - } else { - // for those that are out of the range, q is set to 0 - code = codec[0].code; - len = codec[0].len; - - *p_miss = q; - p_miss++; - cnt_missed++; - } - - // Note that if len == 0, then that means that either the data is all the - // same number or (more likely) all data are outside the quantization - // range. Either way, the code contains no information and is therefore 0 - // bits. 
- - if (32 - start_bit % 32 < len) { - // current unsigned int cannot hold the code - // copy 32 - start_bit % 32 bits to the current int - // and copy the rest len - (32 - start_bit % 32) to the next int - size_t rshift = len - (32 - start_bit % 32); - size_t lshift = 32 - rshift; - *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | (code >> rshift); - *(cur + start_bit / 32 + 1) = - (*(cur + start_bit / 32 + 1)) | (code << lshift); - start_bit += len; - } else if (len > 0) { - code = code << (32 - start_bit % 32 - len); - *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | code; - start_bit += len; - } else { - // Sequence is empty (everything must be the same). Do nothing. - } - } - - // Note: hit size is in bits, while miss size is in bytes. - *out_data_hit_size = start_bit; - *out_data_miss_size = num_miss * sizeof(int); - - // write frequency table to buffer - int nonZeros = 0; - for (int i = 0; i < nql; i++) { - if (ft[i] > 0) { - nonZeros++; - } - } - - size_t *cft = (size_t *)malloc(2 * nonZeros * sizeof(size_t)); - int off = 0; - for (int i = 0; i < nql; i++) { - if (ft[i] > 0) { - cft[2 * off] = i; - cft[2 * off + 1] = ft[i]; - off++; - } - } - - *out_tree = (unsigned char *)cft; - *out_tree_size = 2 * nonZeros * sizeof(size_t); - free(ft); - ft = 0; - - free(codec); - codec = 0; -} - MemoryBuffer compress_memory_huffman(long int *const src, const std::size_t srcLen) { unsigned char *out_data_hit = 0; diff --git a/src/huffman.cpp b/src/huffman.cpp new file mode 100644 index 0000000000..6fc1dbe1d3 --- /dev/null +++ b/src/huffman.cpp @@ -0,0 +1,362 @@ +#include +#include +#include +#include + +#include + +#include "huffman.hpp" + +namespace mgard { + +const int nql = 32768 * 4; + +struct htree_node { + int q; + size_t cnt; + unsigned int code; + size_t len; + htree_node *left; + htree_node *right; +}; + +struct huffman_codec { + int q; + unsigned int code; + size_t len; +}; + +bool myfunction(htree_node i, htree_node j) { return (i.cnt < j.cnt); } + 
+htree_node *new_htree_node(int q, size_t cnt) { + htree_node *new_node = new htree_node; + new_node->q = q; + new_node->cnt = cnt; + new_node->code = 0; + new_node->len = 0; + new_node->left = 0; + new_node->right = 0; + + return new_node; +} + +struct LessThanByCnt { + bool operator()(const htree_node *lhs, const htree_node *rhs) const { + return lhs->cnt > rhs->cnt; + } +}; + +template +using my_priority_queue = + std::priority_queue, LessThanByCnt>; + +void build_codec(htree_node *root, unsigned int code, size_t len, + huffman_codec *codec) { + + root->len = len; + root->code = code; + + if (!root->left && !root->right) { + codec[root->q].q = root->q; + codec[root->q].code = code; + codec[root->q].len = len; + } + + if (root->left) { + build_codec(root->left, code << 1, len + 1, codec); + } + + if (root->right) { + build_codec(root->right, code << 1 | 0x1, len + 1, codec); + } +} + +my_priority_queue *build_tree(size_t *cnt) { + my_priority_queue *phtree; + phtree = new my_priority_queue; +#if 1 + for (int i = 0; i < nql; i++) { + if (cnt[i] != 0) { + htree_node *new_node = new_htree_node(i, cnt[i]); + phtree->push(new_node); + } + } + + while (phtree->size() > 1) { + htree_node *top_node1 = phtree->top(); + phtree->pop(); + htree_node *top_node2 = phtree->top(); + phtree->pop(); + + htree_node *new_node = new_htree_node(-1, top_node1->cnt + top_node2->cnt); + new_node->left = top_node1; + new_node->right = top_node2; + phtree->push(new_node); + } +#endif + return phtree; +} + +void free_htree_node(htree_node *node) { + if (node->left) { + free_htree_node(node->left); + node->left = 0; + } + + if (node->right) { + free_htree_node(node->right); + node->right = 0; + } + + delete node; +} + +void free_tree(my_priority_queue *phtree) { + if (phtree) { + free_htree_node(phtree->top()); + + phtree->pop(); + + delete phtree; + } +} + +// Note this function will change the quantized data. 
+size_t *build_ft(long int *quantized_data, const std::size_t n, + size_t &num_outliers) { + size_t *cnt = (size_t *)malloc(nql * sizeof(size_t)); + std::memset(cnt, 0, nql * sizeof(size_t)); + + for (std::size_t i = 0; i < n; i++) { + // Convert quantization level to positive so that counting freq can be + // easily done. Level 0 is reserved a out-of-range flag. + quantized_data[i] = quantized_data[i] + nql / 2; + if (quantized_data[i] > 0 && quantized_data[i] < nql) { + cnt[quantized_data[i]]++; + } else { + cnt[0]++; + } + } + + num_outliers = cnt[0]; + + return cnt; +} + +huffman_codec *build_huffman_codec(long int *quantized_data, size_t **ft, + const std::size_t n, size_t &num_outliers) { + size_t *cnt; + + cnt = build_ft(quantized_data, n, num_outliers); + *ft = cnt; + + my_priority_queue *phtree = build_tree(cnt); + + huffman_codec *codec = (huffman_codec *)malloc(sizeof(huffman_codec) * nql); + std::memset(codec, 0, sizeof(huffman_codec) * nql); + + build_codec(phtree->top(), 0, 0, codec); + + free_tree(phtree); + phtree = 0; + + return codec; +} + +void huffman_encoding(long int *quantized_data, const std::size_t n, + unsigned char **out_data_hit, size_t *out_data_hit_size, + unsigned char **out_data_miss, size_t *out_data_miss_size, + unsigned char **out_tree, size_t *out_tree_size) { + size_t num_miss = 0; + size_t *ft = 0; + + huffman_codec *codec = build_huffman_codec(quantized_data, &ft, n, num_miss); + + assert(n >= num_miss); + + /* For those miss points, we still need to maintain a flag (q = 0), + * and therefore we need to allocate space for n numbers. 
+ */ + unsigned char *p_hit = (unsigned char *)malloc(n * sizeof(int)); + std::memset(p_hit, 0, n * sizeof(int)); + + int *p_miss = 0; + if (num_miss > 0) { + p_miss = (int *)malloc(num_miss * sizeof(int)); + std::memset(p_miss, 0, num_miss * sizeof(int)); + } + + *out_data_hit = p_hit; + *out_data_miss = (unsigned char *)p_miss; + *out_data_hit_size = 0; + *out_data_miss_size = 0; + + size_t start_bit = 0; + unsigned int *cur = (unsigned int *)p_hit; + size_t cnt_missed = 0; + for (std::size_t i = 0; i < n; i++) { + int q = quantized_data[i]; + unsigned int code; + size_t len; + + if (q > 0 && q < nql) { + // for those that are within the range + code = codec[q].code; + len = codec[q].len; + } else { + // for those that are out of the range, q is set to 0 + code = codec[0].code; + len = codec[0].len; + + *p_miss = q; + p_miss++; + cnt_missed++; + } + + // Note that if len == 0, then that means that either the data is all the + // same number or (more likely) all data are outside the quantization + // range. Either way, the code contains no information and is therefore 0 + // bits. + + if (32 - start_bit % 32 < len) { + // current unsigned int cannot hold the code + // copy 32 - start_bit % 32 bits to the current int + // and copy the rest len - (32 - start_bit % 32) to the next int + size_t rshift = len - (32 - start_bit % 32); + size_t lshift = 32 - rshift; + *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | (code >> rshift); + *(cur + start_bit / 32 + 1) = + (*(cur + start_bit / 32 + 1)) | (code << lshift); + start_bit += len; + } else if (len > 0) { + code = code << (32 - start_bit % 32 - len); + *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | code; + start_bit += len; + } else { + // Sequence is empty (everything must be the same). Do nothing. + } + } + + // Note: hit size is in bits, while miss size is in bytes. 
+ *out_data_hit_size = start_bit; + *out_data_miss_size = num_miss * sizeof(int); + + // write frequency table to buffer + int nonZeros = 0; + for (int i = 0; i < nql; i++) { + if (ft[i] > 0) { + nonZeros++; + } + } + + size_t *cft = (size_t *)malloc(2 * nonZeros * sizeof(size_t)); + int off = 0; + for (int i = 0; i < nql; i++) { + if (ft[i] > 0) { + cft[2 * off] = i; + cft[2 * off + 1] = ft[i]; + off++; + } + } + + *out_tree = (unsigned char *)cft; + *out_tree_size = 2 * nonZeros * sizeof(size_t); + free(ft); + ft = 0; + + free(codec); + codec = 0; +} + +void huffman_decoding(long int *quantized_data, + const std::size_t quantized_data_size, + unsigned char *out_data_hit, size_t out_data_hit_size, + unsigned char *out_data_miss, size_t out_data_miss_size, + unsigned char *out_tree, size_t out_tree_size) { + size_t *cft = (size_t *)out_tree; + int nonZeros = out_tree_size / (2 * sizeof(size_t)); + size_t *ft = (size_t *)malloc(nql * sizeof(size_t)); + + std::memset(ft, 0, nql * sizeof(size_t)); + + for (int j = 0; j < nonZeros; j++) { + ft[cft[2 * j]] = cft[2 * j + 1]; + } + + my_priority_queue *phtree = build_tree(ft); + + unsigned int *buf = (unsigned int *)out_data_hit; + + // The out_data_miss may not be aligned. Therefore, the code + // here makes a new buffer. 
+ int *miss_buf = (int *)malloc(out_data_miss_size); + if (out_data_miss_size) { + std::memcpy(miss_buf, out_data_miss, out_data_miss_size); + } + + int *miss_bufp = miss_buf; + + size_t start_bit = 0; + unsigned int mask = 0x80000000; + + long int *q = quantized_data; + size_t i = 0; + size_t num_missed = 0; + while (q < (quantized_data + (quantized_data_size / sizeof(*q)))) { + htree_node *root = phtree->top(); + assert(root); + + size_t len = 0; + int offset = 0; + while (root->left) { + int flag = *(buf + start_bit / 32 + offset) & mask; + if (!flag) { + root = root->left; + } else { + root = root->right; + } + + len++; + + mask >>= 1; + if (!mask) { + mask = 0x80000000; + offset = 1; + } else { + // offset = 0; + } + } + + if (root->q != 0) { + *q = root->q - nql / 2; + + } else { + *q = *miss_buf - nql / 2; + + miss_buf++; + num_missed++; + } + + q++; + i++; + + start_bit += len; + } + + assert(start_bit == out_data_hit_size); + assert(sizeof(int) * num_missed == out_data_miss_size); + + // Avoid unused argument warning. If NDEBUG is defined, then the assert + // becomes empty and out_data_hit_size is unused. Tell the compiler that + // is OK and expected. + (void)out_data_hit_size; + + free(miss_bufp); + miss_bufp = 0; + free_tree(phtree); + phtree = 0; + free(ft); + ft = 0; +} + +} // namespace mgard From f43b69f369e5cb9fc911669f1e83fac6f6fbc793 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 10 May 2022 17:55:25 -0400 Subject: [PATCH 02/58] Delete unused comparison function. 
--- src/huffman.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/huffman.cpp b/src/huffman.cpp index 6fc1dbe1d3..55788050ba 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -26,8 +26,6 @@ struct huffman_codec { size_t len; }; -bool myfunction(htree_node i, htree_node j) { return (i.cnt < j.cnt); } - htree_node *new_htree_node(int q, size_t cnt) { htree_node *new_node = new htree_node; new_node->q = q; From 258bec855bfd5d1a00a7019cca277884545f5679 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 10 May 2022 17:56:39 -0400 Subject: [PATCH 03/58] Replace `size_t` with `std::size_t`. --- src/huffman.cpp | 77 ++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/src/huffman.cpp b/src/huffman.cpp index 55788050ba..5fb40c2f23 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -13,9 +13,9 @@ const int nql = 32768 * 4; struct htree_node { int q; - size_t cnt; + std::size_t cnt; unsigned int code; - size_t len; + std::size_t len; htree_node *left; htree_node *right; }; @@ -23,10 +23,10 @@ struct htree_node { struct huffman_codec { int q; unsigned int code; - size_t len; + std::size_t len; }; -htree_node *new_htree_node(int q, size_t cnt) { +htree_node *new_htree_node(int q, std::size_t cnt) { htree_node *new_node = new htree_node; new_node->q = q; new_node->cnt = cnt; @@ -48,7 +48,7 @@ template using my_priority_queue = std::priority_queue, LessThanByCnt>; -void build_codec(htree_node *root, unsigned int code, size_t len, +void build_codec(htree_node *root, unsigned int code, std::size_t len, huffman_codec *codec) { root->len = len; @@ -69,7 +69,7 @@ void build_codec(htree_node *root, unsigned int code, size_t len, } } -my_priority_queue *build_tree(size_t *cnt) { +my_priority_queue *build_tree(std::size_t *cnt) { my_priority_queue *phtree; phtree = new my_priority_queue; #if 1 @@ -120,10 +120,10 @@ void free_tree(my_priority_queue *phtree) { } // Note this function will change the quantized 
data. -size_t *build_ft(long int *quantized_data, const std::size_t n, - size_t &num_outliers) { - size_t *cnt = (size_t *)malloc(nql * sizeof(size_t)); - std::memset(cnt, 0, nql * sizeof(size_t)); +std::size_t *build_ft(long int *quantized_data, const std::size_t n, + std::size_t &num_outliers) { + std::size_t *cnt = (std::size_t *)malloc(nql * sizeof(std::size_t)); + std::memset(cnt, 0, nql * sizeof(std::size_t)); for (std::size_t i = 0; i < n; i++) { // Convert quantization level to positive so that counting freq can be @@ -141,9 +141,10 @@ size_t *build_ft(long int *quantized_data, const std::size_t n, return cnt; } -huffman_codec *build_huffman_codec(long int *quantized_data, size_t **ft, - const std::size_t n, size_t &num_outliers) { - size_t *cnt; +huffman_codec *build_huffman_codec(long int *quantized_data, std::size_t **ft, + const std::size_t n, + std::size_t &num_outliers) { + std::size_t *cnt; cnt = build_ft(quantized_data, n, num_outliers); *ft = cnt; @@ -162,11 +163,13 @@ huffman_codec *build_huffman_codec(long int *quantized_data, size_t **ft, } void huffman_encoding(long int *quantized_data, const std::size_t n, - unsigned char **out_data_hit, size_t *out_data_hit_size, - unsigned char **out_data_miss, size_t *out_data_miss_size, - unsigned char **out_tree, size_t *out_tree_size) { - size_t num_miss = 0; - size_t *ft = 0; + unsigned char **out_data_hit, + std::size_t *out_data_hit_size, + unsigned char **out_data_miss, + std::size_t *out_data_miss_size, unsigned char **out_tree, + std::size_t *out_tree_size) { + std::size_t num_miss = 0; + std::size_t *ft = 0; huffman_codec *codec = build_huffman_codec(quantized_data, &ft, n, num_miss); @@ -189,13 +192,13 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, *out_data_hit_size = 0; *out_data_miss_size = 0; - size_t start_bit = 0; + std::size_t start_bit = 0; unsigned int *cur = (unsigned int *)p_hit; - size_t cnt_missed = 0; + std::size_t cnt_missed = 0; for (std::size_t i = 0; i < 
n; i++) { int q = quantized_data[i]; unsigned int code; - size_t len; + std::size_t len; if (q > 0 && q < nql) { // for those that are within the range @@ -220,8 +223,8 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, // current unsigned int cannot hold the code // copy 32 - start_bit % 32 bits to the current int // and copy the rest len - (32 - start_bit % 32) to the next int - size_t rshift = len - (32 - start_bit % 32); - size_t lshift = 32 - rshift; + std::size_t rshift = len - (32 - start_bit % 32); + std::size_t lshift = 32 - rshift; *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | (code >> rshift); *(cur + start_bit / 32 + 1) = (*(cur + start_bit / 32 + 1)) | (code << lshift); @@ -247,7 +250,7 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, } } - size_t *cft = (size_t *)malloc(2 * nonZeros * sizeof(size_t)); + std::size_t *cft = (std::size_t *)malloc(2 * nonZeros * sizeof(std::size_t)); int off = 0; for (int i = 0; i < nql; i++) { if (ft[i] > 0) { @@ -258,7 +261,7 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, } *out_tree = (unsigned char *)cft; - *out_tree_size = 2 * nonZeros * sizeof(size_t); + *out_tree_size = 2 * nonZeros * sizeof(std::size_t); free(ft); ft = 0; @@ -268,14 +271,16 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, void huffman_decoding(long int *quantized_data, const std::size_t quantized_data_size, - unsigned char *out_data_hit, size_t out_data_hit_size, - unsigned char *out_data_miss, size_t out_data_miss_size, - unsigned char *out_tree, size_t out_tree_size) { - size_t *cft = (size_t *)out_tree; - int nonZeros = out_tree_size / (2 * sizeof(size_t)); - size_t *ft = (size_t *)malloc(nql * sizeof(size_t)); + unsigned char *out_data_hit, + std::size_t out_data_hit_size, + unsigned char *out_data_miss, + std::size_t out_data_miss_size, unsigned char *out_tree, + std::size_t out_tree_size) { + std::size_t *cft = (std::size_t *)out_tree; + int 
nonZeros = out_tree_size / (2 * sizeof(std::size_t)); + std::size_t *ft = (std::size_t *)malloc(nql * sizeof(std::size_t)); - std::memset(ft, 0, nql * sizeof(size_t)); + std::memset(ft, 0, nql * sizeof(std::size_t)); for (int j = 0; j < nonZeros; j++) { ft[cft[2 * j]] = cft[2 * j + 1]; @@ -294,17 +299,17 @@ void huffman_decoding(long int *quantized_data, int *miss_bufp = miss_buf; - size_t start_bit = 0; + std::size_t start_bit = 0; unsigned int mask = 0x80000000; long int *q = quantized_data; - size_t i = 0; - size_t num_missed = 0; + std::size_t i = 0; + std::size_t num_missed = 0; while (q < (quantized_data + (quantized_data_size / sizeof(*q)))) { htree_node *root = phtree->top(); assert(root); - size_t len = 0; + std::size_t len = 0; int offset = 0; while (root->left) { int flag = *(buf + start_bit / 32 + offset) & mask; From bf1d545bfa7955297f2fc1c4c8a788464f9e7e1f Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 11 May 2022 10:35:04 -0400 Subject: [PATCH 04/58] Replace `malloc` calls with `new` expressions. --- src/huffman.cpp | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/src/huffman.cpp b/src/huffman.cpp index 5fb40c2f23..d523c4f797 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -122,8 +122,9 @@ void free_tree(my_priority_queue *phtree) { // Note this function will change the quantized data. std::size_t *build_ft(long int *quantized_data, const std::size_t n, std::size_t &num_outliers) { - std::size_t *cnt = (std::size_t *)malloc(nql * sizeof(std::size_t)); - std::memset(cnt, 0, nql * sizeof(std::size_t)); + // The elements of the array are value-initialized (which, because they have + // scalar type, is zero-initialized). 
+ std::size_t *const cnt = new std::size_t[nql](); for (std::size_t i = 0; i < n; i++) { // Convert quantization level to positive so that counting freq can be @@ -151,8 +152,10 @@ huffman_codec *build_huffman_codec(long int *quantized_data, std::size_t **ft, my_priority_queue *phtree = build_tree(cnt); - huffman_codec *codec = (huffman_codec *)malloc(sizeof(huffman_codec) * nql); - std::memset(codec, 0, sizeof(huffman_codec) * nql); + // Each element of the array is value-initialized. Since `huffman_codec` has + // an implicitly-defined default constructor, value-initialization is zero- + // initialization. I am, of course, not sure about this. + huffman_codec *const codec = new huffman_codec[nql](); build_codec(phtree->top(), 0, 0, codec); @@ -171,29 +174,30 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, std::size_t num_miss = 0; std::size_t *ft = 0; - huffman_codec *codec = build_huffman_codec(quantized_data, &ft, n, num_miss); + huffman_codec *const codec = + build_huffman_codec(quantized_data, &ft, n, num_miss); assert(n >= num_miss); /* For those miss points, we still need to maintain a flag (q = 0), * and therefore we need to allocate space for n numbers. */ - unsigned char *p_hit = (unsigned char *)malloc(n * sizeof(int)); - std::memset(p_hit, 0, n * sizeof(int)); + // The elements of the array are value-initialized (here, zero-initialized). + unsigned int *const p_hit = new unsigned int[n](); int *p_miss = 0; if (num_miss > 0) { - p_miss = (int *)malloc(num_miss * sizeof(int)); - std::memset(p_miss, 0, num_miss * sizeof(int)); + // The elements of the array are value-initialized (here, zero-initialized). 
+ p_miss = new int[num_miss](); } - *out_data_hit = p_hit; + *out_data_hit = reinterpret_cast(p_hit); *out_data_miss = (unsigned char *)p_miss; *out_data_hit_size = 0; *out_data_miss_size = 0; std::size_t start_bit = 0; - unsigned int *cur = (unsigned int *)p_hit; + unsigned int *cur = p_hit; std::size_t cnt_missed = 0; for (std::size_t i = 0; i < n; i++) { int q = quantized_data[i]; @@ -250,7 +254,7 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, } } - std::size_t *cft = (std::size_t *)malloc(2 * nonZeros * sizeof(std::size_t)); + std::size_t *const cft = new std::size_t[2 * nonZeros]; int off = 0; for (int i = 0; i < nql; i++) { if (ft[i] > 0) { @@ -262,11 +266,10 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, *out_tree = (unsigned char *)cft; *out_tree_size = 2 * nonZeros * sizeof(std::size_t); - free(ft); + delete[] ft; ft = 0; - free(codec); - codec = 0; + delete[] codec; } void huffman_decoding(long int *quantized_data, @@ -278,9 +281,8 @@ void huffman_decoding(long int *quantized_data, std::size_t out_tree_size) { std::size_t *cft = (std::size_t *)out_tree; int nonZeros = out_tree_size / (2 * sizeof(std::size_t)); - std::size_t *ft = (std::size_t *)malloc(nql * sizeof(std::size_t)); - - std::memset(ft, 0, nql * sizeof(std::size_t)); + // The elements of the array are value-initialized (here, zero-initialized). + std::size_t *const ft = new std::size_t[nql](); for (int j = 0; j < nonZeros; j++) { ft[cft[2 * j]] = cft[2 * j + 1]; @@ -292,12 +294,13 @@ void huffman_decoding(long int *quantized_data, // The out_data_miss may not be aligned. Therefore, the code // here makes a new buffer. 
- int *miss_buf = (int *)malloc(out_data_miss_size); + assert(not(out_data_miss_size % sizeof(int))); + int *miss_buf = new int[out_data_miss_size / sizeof(int)]; if (out_data_miss_size) { std::memcpy(miss_buf, out_data_miss, out_data_miss_size); } - int *miss_bufp = miss_buf; + int *const miss_bufp = miss_buf; std::size_t start_bit = 0; unsigned int mask = 0x80000000; @@ -354,12 +357,10 @@ void huffman_decoding(long int *quantized_data, // is OK and expected. (void)out_data_hit_size; - free(miss_bufp); - miss_bufp = 0; + delete[] miss_bufp; free_tree(phtree); phtree = 0; - free(ft); - ft = 0; + delete[] ft; } } // namespace mgard From 444541d5ba14cb2c3a8c8fc513f3214f58bc8c38 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 11 May 2022 10:49:48 -0400 Subject: [PATCH 05/58] Replace `new_htree_node` with a constructor. --- src/huffman.cpp | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/huffman.cpp b/src/huffman.cpp index d523c4f797..69f0f73ef0 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -12,6 +12,10 @@ namespace mgard { const int nql = 32768 * 4; struct htree_node { + //! Constructor. 
+ htree_node(const int q, const std::size_t cnt) + : q(q), cnt(cnt), code(0), len(0), left(nullptr), right(nullptr) {} + int q; std::size_t cnt; unsigned int code; @@ -26,18 +30,6 @@ struct huffman_codec { std::size_t len; }; -htree_node *new_htree_node(int q, std::size_t cnt) { - htree_node *new_node = new htree_node; - new_node->q = q; - new_node->cnt = cnt; - new_node->code = 0; - new_node->len = 0; - new_node->left = 0; - new_node->right = 0; - - return new_node; -} - struct LessThanByCnt { bool operator()(const htree_node *lhs, const htree_node *rhs) const { return lhs->cnt > rhs->cnt; @@ -75,7 +67,7 @@ my_priority_queue *build_tree(std::size_t *cnt) { #if 1 for (int i = 0; i < nql; i++) { if (cnt[i] != 0) { - htree_node *new_node = new_htree_node(i, cnt[i]); + htree_node *const new_node = new htree_node(i, cnt[i]); phtree->push(new_node); } } @@ -86,7 +78,8 @@ my_priority_queue *build_tree(std::size_t *cnt) { htree_node *top_node2 = phtree->top(); phtree->pop(); - htree_node *new_node = new_htree_node(-1, top_node1->cnt + top_node2->cnt); + htree_node *const new_node = + new htree_node(-1, top_node1->cnt + top_node2->cnt); new_node->left = top_node1; new_node->right = top_node2; phtree->push(new_node); From 362a5d519d14d0cfb99ba4f46cf9a3550c6c0a8e Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 11 May 2022 11:15:09 -0400 Subject: [PATCH 06/58] Add `const` to Huffman tree variable types. 
--- include/huffman.hpp | 12 ++++---- src/huffman.cpp | 72 ++++++++++++++++++++++----------------------- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 67bd1bf2fd..9f66780e0d 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -5,16 +5,16 @@ namespace mgard { -void huffman_encoding(long int *quantized_data, const std::size_t n, +void huffman_encoding(long int *const quantized_data, const std::size_t n, unsigned char **out_data_hit, size_t *out_data_hit_size, unsigned char **out_data_miss, size_t *out_data_miss_size, unsigned char **out_tree, size_t *out_tree_size); -void huffman_decoding(long int *quantized_data, - const std::size_t quantized_data_size, - unsigned char *out_data_hit, size_t out_data_hit_size, - unsigned char *out_data_miss, size_t out_data_miss_size, - unsigned char *out_tree, size_t out_tree_size); +void huffman_decoding( + long int *const quantized_data, const std::size_t quantized_data_size, + unsigned char const *const out_data_hit, const size_t out_data_hit_size, + unsigned char const *const out_data_miss, const size_t out_data_miss_size, + unsigned char const *const out_tree, const size_t out_tree_size); } // namespace mgard diff --git a/src/huffman.cpp b/src/huffman.cpp index 69f0f73ef0..85d3b97a8d 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -31,7 +31,8 @@ struct huffman_codec { }; struct LessThanByCnt { - bool operator()(const htree_node *lhs, const htree_node *rhs) const { + bool operator()(htree_node const *const lhs, + htree_node const *const rhs) const { return lhs->cnt > rhs->cnt; } }; @@ -40,8 +41,8 @@ template using my_priority_queue = std::priority_queue, LessThanByCnt>; -void build_codec(htree_node *root, unsigned int code, std::size_t len, - huffman_codec *codec) { +void build_codec(htree_node *const root, const unsigned int code, + const std::size_t len, huffman_codec *const codec) { root->len = len; root->code = code; @@ -61,9 +62,9 @@ void 
build_codec(htree_node *root, unsigned int code, std::size_t len, } } -my_priority_queue *build_tree(std::size_t *cnt) { - my_priority_queue *phtree; - phtree = new my_priority_queue; +my_priority_queue *build_tree(std::size_t const *const cnt) { + my_priority_queue *const phtree = + new my_priority_queue; #if 1 for (int i = 0; i < nql; i++) { if (cnt[i] != 0) { @@ -73,9 +74,9 @@ my_priority_queue *build_tree(std::size_t *cnt) { } while (phtree->size() > 1) { - htree_node *top_node1 = phtree->top(); + htree_node *const top_node1 = phtree->top(); phtree->pop(); - htree_node *top_node2 = phtree->top(); + htree_node *const top_node2 = phtree->top(); phtree->pop(); htree_node *const new_node = @@ -88,7 +89,7 @@ my_priority_queue *build_tree(std::size_t *cnt) { return phtree; } -void free_htree_node(htree_node *node) { +void free_htree_node(htree_node *const node) { if (node->left) { free_htree_node(node->left); node->left = 0; @@ -102,7 +103,7 @@ void free_htree_node(htree_node *node) { delete node; } -void free_tree(my_priority_queue *phtree) { +void free_tree(my_priority_queue *const phtree) { if (phtree) { free_htree_node(phtree->top()); @@ -113,7 +114,7 @@ void free_tree(my_priority_queue *phtree) { } // Note this function will change the quantized data. -std::size_t *build_ft(long int *quantized_data, const std::size_t n, +std::size_t *build_ft(long int *const quantized_data, const std::size_t n, std::size_t &num_outliers) { // The elements of the array are value-initialized (which, because they have // scalar type, is zero-initialized). 
@@ -135,15 +136,13 @@ std::size_t *build_ft(long int *quantized_data, const std::size_t n, return cnt; } -huffman_codec *build_huffman_codec(long int *quantized_data, std::size_t **ft, - const std::size_t n, +huffman_codec *build_huffman_codec(long int *const quantized_data, + std::size_t **ft, const std::size_t n, std::size_t &num_outliers) { - std::size_t *cnt; - - cnt = build_ft(quantized_data, n, num_outliers); + std::size_t *const cnt = build_ft(quantized_data, n, num_outliers); *ft = cnt; - my_priority_queue *phtree = build_tree(cnt); + my_priority_queue *const phtree = build_tree(cnt); // Each element of the array is value-initialized. Since `huffman_codec` has // an implicitly-defined default constructor, value-initialization is zero- @@ -153,12 +152,11 @@ huffman_codec *build_huffman_codec(long int *quantized_data, std::size_t **ft, build_codec(phtree->top(), 0, 0, codec); free_tree(phtree); - phtree = 0; return codec; } -void huffman_encoding(long int *quantized_data, const std::size_t n, +void huffman_encoding(long int *const quantized_data, const std::size_t n, unsigned char **out_data_hit, std::size_t *out_data_hit_size, unsigned char **out_data_miss, @@ -193,7 +191,7 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, unsigned int *cur = p_hit; std::size_t cnt_missed = 0; for (std::size_t i = 0; i < n; i++) { - int q = quantized_data[i]; + const int q = quantized_data[i]; unsigned int code; std::size_t len; @@ -265,15 +263,16 @@ void huffman_encoding(long int *quantized_data, const std::size_t n, delete[] codec; } -void huffman_decoding(long int *quantized_data, +void huffman_decoding(long int *const quantized_data, const std::size_t quantized_data_size, - unsigned char *out_data_hit, - std::size_t out_data_hit_size, - unsigned char *out_data_miss, - std::size_t out_data_miss_size, unsigned char *out_tree, - std::size_t out_tree_size) { - std::size_t *cft = (std::size_t *)out_tree; - int nonZeros = out_tree_size / (2 * 
sizeof(std::size_t)); + unsigned char const *const out_data_hit, + const std::size_t out_data_hit_size, + unsigned char const *const out_data_miss, + const std::size_t out_data_miss_size, + unsigned char const *const out_tree, + const std::size_t out_tree_size) { + std::size_t const *const cft = (std::size_t const *)out_tree; + const int nonZeros = out_tree_size / (2 * sizeof(std::size_t)); // The elements of the array are value-initialized (here, zero-initialized). std::size_t *const ft = new std::size_t[nql](); @@ -281,19 +280,19 @@ void huffman_decoding(long int *quantized_data, ft[cft[2 * j]] = cft[2 * j + 1]; } - my_priority_queue *phtree = build_tree(ft); + my_priority_queue *const phtree = build_tree(ft); - unsigned int *buf = (unsigned int *)out_data_hit; + unsigned int const *const buf = (unsigned int const *)out_data_hit; // The out_data_miss may not be aligned. Therefore, the code // here makes a new buffer. assert(not(out_data_miss_size % sizeof(int))); - int *miss_buf = new int[out_data_miss_size / sizeof(int)]; + int *const miss_buf = new int[out_data_miss_size / sizeof(int)]; if (out_data_miss_size) { std::memcpy(miss_buf, out_data_miss, out_data_miss_size); } - int *const miss_bufp = miss_buf; + int const *miss_bufp = miss_buf; std::size_t start_bit = 0; unsigned int mask = 0x80000000; @@ -302,7 +301,7 @@ void huffman_decoding(long int *quantized_data, std::size_t i = 0; std::size_t num_missed = 0; while (q < (quantized_data + (quantized_data_size / sizeof(*q)))) { - htree_node *root = phtree->top(); + htree_node const *root = phtree->top(); assert(root); std::size_t len = 0; @@ -330,9 +329,9 @@ void huffman_decoding(long int *quantized_data, *q = root->q - nql / 2; } else { - *q = *miss_buf - nql / 2; + *q = *miss_bufp - nql / 2; - miss_buf++; + miss_bufp++; num_missed++; } @@ -350,9 +349,8 @@ void huffman_decoding(long int *quantized_data, // is OK and expected. 
(void)out_data_hit_size; - delete[] miss_bufp; + delete[] miss_buf; free_tree(phtree); - phtree = 0; delete[] ft; } From a8d3119671addff6b445c2153381d8826cfc6531 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 11 May 2022 11:18:03 -0400 Subject: [PATCH 07/58] Use `nullptr` instead of `0` for pointer values. --- src/huffman.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/huffman.cpp b/src/huffman.cpp index 85d3b97a8d..981918d376 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -92,12 +92,12 @@ my_priority_queue *build_tree(std::size_t const *const cnt) { void free_htree_node(htree_node *const node) { if (node->left) { free_htree_node(node->left); - node->left = 0; + node->left = nullptr; } if (node->right) { free_htree_node(node->right); - node->right = 0; + node->right = nullptr; } delete node; @@ -163,7 +163,7 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, std::size_t *out_data_miss_size, unsigned char **out_tree, std::size_t *out_tree_size) { std::size_t num_miss = 0; - std::size_t *ft = 0; + std::size_t *ft = nullptr; huffman_codec *const codec = build_huffman_codec(quantized_data, &ft, n, num_miss); @@ -176,7 +176,7 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, // The elements of the array are value-initialized (here, zero-initialized). unsigned int *const p_hit = new unsigned int[n](); - int *p_miss = 0; + int *p_miss = nullptr; if (num_miss > 0) { // The elements of the array are value-initialized (here, zero-initialized). 
p_miss = new int[num_miss](); @@ -258,7 +258,7 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, *out_tree = (unsigned char *)cft; *out_tree_size = 2 * nonZeros * sizeof(std::size_t); delete[] ft; - ft = 0; + ft = nullptr; delete[] codec; } From 4125d931df455e4c4d2d477430532daa5d374287 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 11 May 2022 11:58:25 -0400 Subject: [PATCH 08/58] Pass `huffman_encoding` parameters by reference. --- include/huffman.hpp | 6 +++--- src/compressors.cpp | 5 ++--- src/huffman.cpp | 32 ++++++++++++++++---------------- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 9f66780e0d..5705c17996 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -6,9 +6,9 @@ namespace mgard { void huffman_encoding(long int *const quantized_data, const std::size_t n, - unsigned char **out_data_hit, size_t *out_data_hit_size, - unsigned char **out_data_miss, size_t *out_data_miss_size, - unsigned char **out_tree, size_t *out_tree_size); + unsigned char *&out_data_hit, size_t &out_data_hit_size, + unsigned char *&out_data_miss, size_t &out_data_miss_size, + unsigned char *&out_tree, size_t &out_tree_size); void huffman_decoding( long int *const quantized_data, const std::size_t quantized_data_size, diff --git a/src/compressors.cpp b/src/compressors.cpp index 34c3401e14..ec5c2323a1 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -79,9 +79,8 @@ MemoryBuffer compress_memory_huffman(long int *const src, #ifdef MGARD_TIMING auto huff_time1 = std::chrono::high_resolution_clock::now(); #endif - huffman_encoding(src, srcLen, &out_data_hit, &out_data_hit_size, - &out_data_miss, &out_data_miss_size, &out_tree, - &out_tree_size); + huffman_encoding(src, srcLen, out_data_hit, out_data_hit_size, out_data_miss, + out_data_miss_size, out_tree, out_tree_size); #ifdef MGARD_TIMING auto huff_time2 = std::chrono::high_resolution_clock::now(); auto duration = 
std::chrono::duration_cast( diff --git a/src/huffman.cpp b/src/huffman.cpp index 981918d376..6cbd4fd941 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -137,10 +137,10 @@ std::size_t *build_ft(long int *const quantized_data, const std::size_t n, } huffman_codec *build_huffman_codec(long int *const quantized_data, - std::size_t **ft, const std::size_t n, + std::size_t *&ft, const std::size_t n, std::size_t &num_outliers) { std::size_t *const cnt = build_ft(quantized_data, n, num_outliers); - *ft = cnt; + ft = cnt; my_priority_queue *const phtree = build_tree(cnt); @@ -157,16 +157,16 @@ huffman_codec *build_huffman_codec(long int *const quantized_data, } void huffman_encoding(long int *const quantized_data, const std::size_t n, - unsigned char **out_data_hit, - std::size_t *out_data_hit_size, - unsigned char **out_data_miss, - std::size_t *out_data_miss_size, unsigned char **out_tree, - std::size_t *out_tree_size) { + unsigned char *&out_data_hit, + std::size_t &out_data_hit_size, + unsigned char *&out_data_miss, + std::size_t &out_data_miss_size, unsigned char *&out_tree, + std::size_t &out_tree_size) { std::size_t num_miss = 0; std::size_t *ft = nullptr; huffman_codec *const codec = - build_huffman_codec(quantized_data, &ft, n, num_miss); + build_huffman_codec(quantized_data, ft, n, num_miss); assert(n >= num_miss); @@ -182,10 +182,10 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, p_miss = new int[num_miss](); } - *out_data_hit = reinterpret_cast(p_hit); - *out_data_miss = (unsigned char *)p_miss; - *out_data_hit_size = 0; - *out_data_miss_size = 0; + out_data_hit = reinterpret_cast(p_hit); + out_data_miss = (unsigned char *)p_miss; + out_data_hit_size = 0; + out_data_miss_size = 0; std::size_t start_bit = 0; unsigned int *cur = p_hit; @@ -234,8 +234,8 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, } // Note: hit size is in bits, while miss size is in bytes. 
- *out_data_hit_size = start_bit; - *out_data_miss_size = num_miss * sizeof(int); + out_data_hit_size = start_bit; + out_data_miss_size = num_miss * sizeof(int); // write frequency table to buffer int nonZeros = 0; @@ -255,8 +255,8 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, } } - *out_tree = (unsigned char *)cft; - *out_tree_size = 2 * nonZeros * sizeof(std::size_t); + out_tree = (unsigned char *)cft; + out_tree_size = 2 * nonZeros * sizeof(std::size_t); delete[] ft; ft = nullptr; From 846e8c73847707daabb63d0d222b27fafe22e4be Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 11 May 2022 12:31:57 -0400 Subject: [PATCH 09/58] Use `std::vector` for Huffman codec array. --- src/huffman.cpp | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/huffman.cpp b/src/huffman.cpp index 6cbd4fd941..8c1a7e082a 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -4,6 +4,7 @@ #include #include +#include #include "huffman.hpp" @@ -41,9 +42,8 @@ template using my_priority_queue = std::priority_queue, LessThanByCnt>; -void build_codec(htree_node *const root, const unsigned int code, - const std::size_t len, huffman_codec *const codec) { - +void initialize_codec(std::vector &codec, htree_node *const root, + const unsigned int code, const std::size_t len) { root->len = len; root->code = code; @@ -54,11 +54,11 @@ void build_codec(htree_node *const root, const unsigned int code, } if (root->left) { - build_codec(root->left, code << 1, len + 1, codec); + initialize_codec(codec, root->left, code << 1, len + 1); } if (root->right) { - build_codec(root->right, code << 1 | 0x1, len + 1, codec); + initialize_codec(codec, root->right, code << 1 | 0x1, len + 1); } } @@ -136,20 +136,20 @@ std::size_t *build_ft(long int *const quantized_data, const std::size_t n, return cnt; } -huffman_codec *build_huffman_codec(long int *const quantized_data, - std::size_t *&ft, const std::size_t n, - std::size_t 
&num_outliers) { +std::vector build_huffman_codec(long int *const quantized_data, + std::size_t *&ft, + const std::size_t n, + std::size_t &num_outliers) { std::size_t *const cnt = build_ft(quantized_data, n, num_outliers); ft = cnt; my_priority_queue *const phtree = build_tree(cnt); - // Each element of the array is value-initialized. Since `huffman_codec` has + // Each element of the vector is value-initialized. Since `huffman_codec` has // an implicitly-defined default constructor, value-initialization is zero- - // initialization. I am, of course, not sure about this. - huffman_codec *const codec = new huffman_codec[nql](); - - build_codec(phtree->top(), 0, 0, codec); + // initialization. + std::vector codec(nql); + initialize_codec(codec, phtree->top(), 0, 0); free_tree(phtree); @@ -165,7 +165,7 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, std::size_t num_miss = 0; std::size_t *ft = nullptr; - huffman_codec *const codec = + const std::vector codec = build_huffman_codec(quantized_data, ft, n, num_miss); assert(n >= num_miss); @@ -259,8 +259,6 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, out_tree_size = 2 * nonZeros * sizeof(std::size_t); delete[] ft; ft = nullptr; - - delete[] codec; } void huffman_decoding(long int *const quantized_data, From 9ff8b96b6125f69fc29ccac7d6068239793a084c Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 11 May 2022 14:26:30 -0400 Subject: [PATCH 10/58] Gather codecs and frequency table into struct. 
--- src/huffman.cpp | 97 +++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/src/huffman.cpp b/src/huffman.cpp index 8c1a7e082a..44f82bc341 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -3,6 +3,11 @@ #include #include +#ifndef NDEBUG +#include +#endif + +#include #include #include @@ -31,6 +36,13 @@ struct huffman_codec { std::size_t len; }; +template struct HuffmanCodec { + // The arrays are value-initialized, which leads to each of their elements + // being value-initialized (ultimately zero-initialized). + std::array codec{}; + std::array frequency_table{}; +}; + struct LessThanByCnt { bool operator()(htree_node const *const lhs, htree_node const *const rhs) const { @@ -42,15 +54,16 @@ template using my_priority_queue = std::priority_queue, LessThanByCnt>; -void initialize_codec(std::vector &codec, htree_node *const root, +template +void initialize_codec(HuffmanCodec &codec, htree_node *const root, const unsigned int code, const std::size_t len) { root->len = len; root->code = code; if (!root->left && !root->right) { - codec[root->q].q = root->q; - codec[root->q].code = code; - codec[root->q].len = len; + codec.codec[root->q].q = root->q; + codec.codec[root->q].code = code; + codec.codec[root->q].len = len; } if (root->left) { @@ -113,42 +126,35 @@ void free_tree(my_priority_queue *const phtree) { } } -// Note this function will change the quantized data. -std::size_t *build_ft(long int *const quantized_data, const std::size_t n, - std::size_t &num_outliers) { - // The elements of the array are value-initialized (which, because they have - // scalar type, is zero-initialized). - std::size_t *const cnt = new std::size_t[nql](); +// Note: this function will change the quantized data. 
+template +void initialize_frequency_table(HuffmanCodec &codec, + long int *const quantized_data, + const std::size_t n) { + assert(*std::max_element(codec.frequency_table.begin(), + code.frequency_table.end()) == 0); for (std::size_t i = 0; i < n; i++) { // Convert quantization level to positive so that counting freq can be // easily done. Level 0 is reserved a out-of-range flag. - quantized_data[i] = quantized_data[i] + nql / 2; - if (quantized_data[i] > 0 && quantized_data[i] < nql) { - cnt[quantized_data[i]]++; - } else { - cnt[0]++; - } + quantized_data[i] = quantized_data[i] + NQL / 2; + ++codec.frequency_table[quantized_data[i] > 0 && + quantized_data[i] < + static_cast(NQL) + ? quantized_data[i] + : 0]; } - - num_outliers = cnt[0]; - - return cnt; } -std::vector build_huffman_codec(long int *const quantized_data, - std::size_t *&ft, - const std::size_t n, - std::size_t &num_outliers) { - std::size_t *const cnt = build_ft(quantized_data, n, num_outliers); - ft = cnt; +template +HuffmanCodec build_huffman_codec(long int *const quantized_data, + const std::size_t n) { + HuffmanCodec codec; + initialize_frequency_table(codec, quantized_data, n); - my_priority_queue *const phtree = build_tree(cnt); + my_priority_queue *const phtree = + build_tree(codec.frequency_table.data()); - // Each element of the vector is value-initialized. Since `huffman_codec` has - // an implicitly-defined default constructor, value-initialization is zero- - // initialization. 
- std::vector codec(nql); initialize_codec(codec, phtree->top(), 0, 0); free_tree(phtree); @@ -162,11 +168,8 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, unsigned char *&out_data_miss, std::size_t &out_data_miss_size, unsigned char *&out_tree, std::size_t &out_tree_size) { - std::size_t num_miss = 0; - std::size_t *ft = nullptr; - - const std::vector codec = - build_huffman_codec(quantized_data, ft, n, num_miss); + const HuffmanCodec codec = build_huffman_codec(quantized_data, n); + const std::size_t num_miss = codec.frequency_table[0]; assert(n >= num_miss); @@ -197,12 +200,12 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, if (q > 0 && q < nql) { // for those that are within the range - code = codec[q].code; - len = codec[q].len; + code = codec.codec[q].code; + len = codec.codec[q].len; } else { // for those that are out of the range, q is set to 0 - code = codec[0].code; - len = codec[0].len; + code = codec.codec[0].code; + len = codec.codec[0].len; *p_miss = q; p_miss++; @@ -218,8 +221,8 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, // current unsigned int cannot hold the code // copy 32 - start_bit % 32 bits to the current int // and copy the rest len - (32 - start_bit % 32) to the next int - std::size_t rshift = len - (32 - start_bit % 32); - std::size_t lshift = 32 - rshift; + const std::size_t rshift = len - (32 - start_bit % 32); + const std::size_t lshift = 32 - rshift; *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | (code >> rshift); *(cur + start_bit / 32 + 1) = (*(cur + start_bit / 32 + 1)) | (code << lshift); @@ -240,7 +243,7 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, // write frequency table to buffer int nonZeros = 0; for (int i = 0; i < nql; i++) { - if (ft[i] > 0) { + if (codec.frequency_table[i] > 0) { nonZeros++; } } @@ -248,17 +251,15 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, 
std::size_t *const cft = new std::size_t[2 * nonZeros]; int off = 0; for (int i = 0; i < nql; i++) { - if (ft[i] > 0) { + if (codec.frequency_table[i] > 0) { cft[2 * off] = i; - cft[2 * off + 1] = ft[i]; + cft[2 * off + 1] = codec.frequency_table[i]; off++; } } out_tree = (unsigned char *)cft; out_tree_size = 2 * nonZeros * sizeof(std::size_t); - delete[] ft; - ft = nullptr; } void huffman_decoding(long int *const quantized_data, @@ -279,6 +280,7 @@ void huffman_decoding(long int *const quantized_data, } my_priority_queue *const phtree = build_tree(ft); + delete[] ft; unsigned int const *const buf = (unsigned int const *)out_data_hit; @@ -349,7 +351,6 @@ void huffman_decoding(long int *const quantized_data, delete[] miss_buf; free_tree(phtree); - delete[] ft; } } // namespace mgard From 6a249be7e191ac23340ce2d217f52cdf26a17a1b Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Mon, 16 May 2022 12:01:20 -0400 Subject: [PATCH 11/58] Add `Bits` to allow iteration over bits of array. --- CMakeLists.txt | 1 + include/utilities.hpp | 83 ++++++++++++++++++++++++++++++++++++ src/utilities.cpp | 54 +++++++++++++++++++++++ tests/src/test_utilities.cpp | 58 +++++++++++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 src/utilities.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 21d3e50e21..9a2902e6db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,6 +201,7 @@ set( MGARD_LIBRARY_CPP src/compress.cpp src/compress_internal.cpp + src/utilities.cpp src/huffman.cpp src/compressors.cpp src/format.cpp diff --git a/include/utilities.hpp b/include/utilities.hpp index 626bc6f235..05b625a950 100644 --- a/include/utilities.hpp +++ b/include/utilities.hpp @@ -449,6 +449,89 @@ template struct MemoryBuffer { std::size_t size; }; +//! Range allowing iteration over the bits in an array. +//! +//! Iterating over this object yields each byte's bits from most to least +//! significant. +class Bits { +public: + //! Constructor. + //! 
+ //!\param begin Pointer to the beginning of the array to be iterated over. + //!\param end Pointer to the end of the array to be iterated over. + Bits(unsigned char const *const begin, unsigned char const *const end); + + //! Equality comparison. + bool operator==(const Bits &other) const; + + //! Inequality comparison. + bool operator!=(const Bits &other) const; + + // Forward declaration. + class iterator; + + //! Return an iterator to the beginning of the bit range. + iterator begin() const; + + //! Return an iterator to the end of the bit range. + iterator end() const; + +private: + //! Pointer to the beginning of the array to be iterated over. + unsigned char const *begin_; + + //! Pointer to the beginning of the array to be iterated over. + unsigned char const *end_; +}; + +//! Iterator over a bit range. +class Bits::iterator { +public: + //! Category of the iterator. + using iterator_category = std::forward_iterator_tag; + //! Type iterated over. + using value_type = bool; + //! Type for distance between iterators. + using difference_type = std::ptrdiff_t; + //! Pointer to `value_type`. + using pointer = value_type *; + //! Type returned by the dereference operator. + using reference = value_type; + + //! Constructor. + //! + //!\param bits Associated bit range. + //!\param p Position in the array being iterated over. + //!\param offset Offset within the current byte. + iterator(const Bits &bits, unsigned char const *const p, + const unsigned char offset); + + //! Equality comparison. + bool operator==(const iterator &other) const; + + //! Inequality comparison. + bool operator!=(const iterator &other) const; + + //! Preincrement. + iterator &operator++(); + + //! Postincrement. + iterator operator++(int); + + //! Dereference. + reference operator*() const; + +private: + //! Associated bit range. + const Bits &iterable; + + //! Position in the array being iterated over. + unsigned char const *p; + + //! Offset within the current byte. 
+ unsigned char offset; +}; + } // namespace mgard #include "utilities.tpp" diff --git a/src/utilities.cpp b/src/utilities.cpp new file mode 100644 index 0000000000..4c3aec863a --- /dev/null +++ b/src/utilities.cpp @@ -0,0 +1,54 @@ +#include "utilities.hpp" + +#include + +#include + +namespace mgard { + +Bits::Bits(unsigned char const *const begin, unsigned char const *const end) + : begin_(begin), end_(end) {} + +bool Bits::operator==(const Bits &other) const { + return begin_ == other.begin_ and end_ == other.end_; +} + +bool Bits::operator!=(const Bits &other) const { return !operator==(other); } + +Bits::iterator Bits::begin() const { return {*this, begin_, 0}; } + +Bits::iterator Bits::end() const { return {*this, end_, 0}; } + +Bits::iterator::iterator(const Bits &iterable, unsigned char const *const p, + const unsigned char offset) + : iterable(iterable), p(p), offset(offset) {} + +bool Bits::iterator::operator==(const Bits::iterator &other) const { + return offset == other.offset and p == other.p and iterable == other.iterable; +} + +bool Bits::iterator::operator!=(const Bits::iterator &other) const { + return !operator==(other); +} + +Bits::iterator &Bits::iterator::operator++() { + ++offset; + if (offset == CHAR_BIT) { + ++p; + offset = 0; + } + return *this; +} + +Bits::iterator Bits::iterator::operator++(int) { + const iterator tmp = *this; + operator++(); + return tmp; +} + +Bits::iterator::reference Bits::iterator::operator*() const { + // Operator precedence: dereference, then left shift, then bitwise AND. 
+ return *p << offset & 0x80; +} + +} // namespace mgard diff --git a/tests/src/test_utilities.cpp b/tests/src/test_utilities.cpp index 3d665e331e..31813102fd 100644 --- a/tests/src/test_utilities.cpp +++ b/tests/src/test_utilities.cpp @@ -171,3 +171,61 @@ TEST_CASE("CartesianProduct predecessors and successors", "[utilities]") { REQUIRE(tracker); } + +namespace { + +void test_bit_equality(const mgard::Bits &bits, + const std::vector &expected) { + TrialTracker tracker; + std::vector::const_iterator p = expected.begin(); + for (const bool b : bits) { + tracker += b == *p++; + } + REQUIRE(tracker); +} + +} // namespace + +TEST_CASE("Bits iteration", "[utilities]") { + SECTION("zero end offsets") { + { + unsigned char const a[1]{0x3d}; + const mgard::Bits bits(a, a + 1); + const std::vector expected{// `3`. + false, false, true, true, + // `d`. + true, true, false, true}; + test_bit_equality(bits, expected); + } + { + unsigned char const a[2]{0xe6, 0x0a}; + const mgard::Bits bits(a, a + 2); + const std::vector expected{// `e`. + true, true, true, false, + // `6`. + false, true, true, false, + // `0`. + false, false, false, false, + // `a`. + true, false, true, false}; + test_bit_equality(bits, expected); + } + { + unsigned char const a[3]{0x12, 0x0c, 0xff}; + const mgard::Bits bits(a, a + 3); + const std::vector expected{// `1`. + false, false, false, true, + // `2`. + false, false, true, false, + // `0`. + false, false, false, false, + // `c`. + true, true, false, false, + // `f`. + true, true, true, true, + // `f`. + true, true, true, true}; + test_bit_equality(bits, expected); + } + } +} From 24f305264dec2bf474e73018f68a99a564aeb630 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 18 May 2022 16:42:04 -0400 Subject: [PATCH 12/58] Allow nonzero end bit offsets in `Bits`. 
--- include/utilities.hpp | 13 +++++++++++++ src/utilities.cpp | 16 +++++++++++++--- tests/src/test_utilities.cpp | 24 ++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/include/utilities.hpp b/include/utilities.hpp index 05b625a950..9f514ef472 100644 --- a/include/utilities.hpp +++ b/include/utilities.hpp @@ -461,6 +461,16 @@ class Bits { //!\param end Pointer to the end of the array to be iterated over. Bits(unsigned char const *const begin, unsigned char const *const end); + //! Constructor. + //! + //!\overload + //! + //!\param begin Pointer to the beginning of the array to be iterated over. + //!\param end Pointer to the end of the array to be iterated over. + //!\param offset_end Offset for end iterator. + Bits(unsigned char const *const begin, unsigned char const *const end, + const unsigned char offset_end); + //! Equality comparison. bool operator==(const Bits &other) const; @@ -482,6 +492,9 @@ class Bits { //! Pointer to the beginning of the array to be iterated over. unsigned char const *end_; + + //! Offset for end iterator. + unsigned char offset_end; }; //! Iterator over a bit range. 
diff --git a/src/utilities.cpp b/src/utilities.cpp index 4c3aec863a..1c5afeb3d0 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -6,18 +6,28 @@ namespace mgard { +Bits::Bits(unsigned char const *const begin, unsigned char const *const end, + const unsigned char offset_end) + : begin_(begin), end_(end), offset_end(offset_end) { + if (offset_end >= CHAR_BIT) { + throw std::invalid_argument( + "offset must be smaller than number of bits in byte"); + } +} + Bits::Bits(unsigned char const *const begin, unsigned char const *const end) - : begin_(begin), end_(end) {} + : Bits(begin, end, 0) {} bool Bits::operator==(const Bits &other) const { - return begin_ == other.begin_ and end_ == other.end_; + return begin_ == other.begin_ and end_ == other.end_ and + offset_end == other.offset_end; } bool Bits::operator!=(const Bits &other) const { return !operator==(other); } Bits::iterator Bits::begin() const { return {*this, begin_, 0}; } -Bits::iterator Bits::end() const { return {*this, end_, 0}; } +Bits::iterator Bits::end() const { return {*this, end_, offset_end}; } Bits::iterator::iterator(const Bits &iterable, unsigned char const *const p, const unsigned char offset) diff --git a/tests/src/test_utilities.cpp b/tests/src/test_utilities.cpp index 31813102fd..1e53eec72e 100644 --- a/tests/src/test_utilities.cpp +++ b/tests/src/test_utilities.cpp @@ -228,4 +228,28 @@ TEST_CASE("Bits iteration", "[utilities]") { test_bit_equality(bits, expected); } } + SECTION("nonzero end offsets") { + { + unsigned char const a[1]{0xff}; + const mgard::Bits bits(a, a, 7); + const std::vector expected(7, true); + test_bit_equality(bits, expected); + } + { + unsigned char const a[2]{0xa9, 0x33}; + const mgard::Bits bits(a, a + 1, 2); + const std::vector expected{true, false, true, false, true, + false, false, true, false, false}; + test_bit_equality(bits, expected); + } + { + unsigned char const a[3]{0x1e, 0x0f, 0x77}; + const mgard::Bits bits(a, a + 2, 6); + const std::vector 
expected{false, false, false, true, true, true, + true, false, false, false, false, false, + true, true, true, true, false, true, + true, true, false, true}; + test_bit_equality(bits, expected); + } + } } From 8888b565a2e01d0314376d4dc10569e129487bd9 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Fri, 27 May 2022 20:44:09 -0400 Subject: [PATCH 13/58] Add Huffman encoding regression tests. --- include/huffman.hpp | 14 +++++ src/compressors.cpp | 2 +- src/huffman.cpp | 73 ++++++++++++++++------ tests/CMakeLists.txt | 1 + tests/src/test_huffman.cpp | 122 +++++++++++++++++++++++++++++++++++++ 5 files changed, 193 insertions(+), 19 deletions(-) create mode 100644 tests/src/test_huffman.cpp diff --git a/include/huffman.hpp b/include/huffman.hpp index 5705c17996..0ee06103d2 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -3,8 +3,22 @@ //!\file //!\brief Huffman trees for quantized multilevel coefficients. +#include + namespace mgard { +//! Encode quantized coefficients using a Huffman code. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! buffer will be changed by the encoding process. +//\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +//!\param[out] out_data_hit Pointer to compressed buffer. +//!\param[out] Size *in bits* of compressed buffer. +//!\param[out] Pointer to 'missed' buffer (input symbols not assigned codes). +//!\param[out] Size *in bytes* of 'missed' buffer. +//!\param[out] Frequency table for input buffer. +//!\param[out] Size *in bytes* of the frequency table. 
void huffman_encoding(long int *const quantized_data, const std::size_t n, unsigned char *&out_data_hit, size_t &out_data_hit_size, unsigned char *&out_data_miss, size_t &out_data_miss_size, diff --git a/src/compressors.cpp b/src/compressors.cpp index ec5c2323a1..05852b1ef3 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -5,7 +5,6 @@ #include #include -#include #include #include #include @@ -14,6 +13,7 @@ #include "format.hpp" #include "huffman.hpp" +#include "utilities.hpp" #ifdef MGARD_TIMING #include diff --git a/src/huffman.cpp b/src/huffman.cpp index 44f82bc341..3f0ccd18e2 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -3,9 +3,7 @@ #include #include -#ifndef NDEBUG #include -#endif #include #include @@ -17,33 +15,64 @@ namespace mgard { const int nql = 32768 * 4; +//! Node in the Huffman code creation tree. struct htree_node { //! Constructor. + //! + //!\param q (Transformed) symbol. + //!\param cnt Number of occurrences of the (transformed) symbol in the source. htree_node(const int q, const std::size_t cnt) : q(q), cnt(cnt), code(0), len(0), left(nullptr), right(nullptr) {} + //! (Transformed) symbol. int q; + + //! Number of occurrences of the (transformed) symbol in the source. std::size_t cnt; + + //! Codeword associated to the (transformed) symbol. unsigned int code; + + //! Length in bits of the codeword. std::size_t len; + + //! Left child in the code creation tree. htree_node *left; + + //! Right child in the code creation tree. htree_node *right; }; +//! Input symbol–Huffman code pair. struct huffman_codec { + //! (Transformed) symbol. int q; + + //! Codeword associated to the (transformed) symbol. unsigned int code; + + //! Length in bits of the codeword. std::size_t len; }; +//! Frequency table and symbol–code mappings for encoding source. template struct HuffmanCodec { // The arrays are value-initialized, which leads to each of their elements // being value-initialized (ultimately zero-initialized). + + //! 
Input symbol–Huffman code pairs. std::array codec{}; + + //! Frequency table for encoding source. std::array frequency_table{}; }; +//! Function object for comparing Huffman code creation nodes. struct LessThanByCnt { + //! Return whether the first node has a larger count than the second. + //! + //!\param lhs First node. + //!\param rhs Second node. bool operator()(htree_node const *const lhs, htree_node const *const rhs) const { return lhs->cnt > rhs->cnt; @@ -54,16 +83,16 @@ template using my_priority_queue = std::priority_queue, LessThanByCnt>; -template -void initialize_codec(HuffmanCodec &codec, htree_node *const root, +void initialize_codec(HuffmanCodec &codec, htree_node *const root, const unsigned int code, const std::size_t len) { - root->len = len; + std::array &codewords = codec.codec; + root->code = code; + root->len = len; if (!root->left && !root->right) { - codec.codec[root->q].q = root->q; - codec.codec[root->q].code = code; - codec.codec[root->q].len = len; + const std::size_t index = root->q; + codewords.at(index) = {root->q, code, len}; } if (root->left) { @@ -78,7 +107,6 @@ void initialize_codec(HuffmanCodec &codec, htree_node *const root, my_priority_queue *build_tree(std::size_t const *const cnt) { my_priority_queue *const phtree = new my_priority_queue; -#if 1 for (int i = 0; i < nql; i++) { if (cnt[i] != 0) { htree_node *const new_node = new htree_node(i, cnt[i]); @@ -98,7 +126,6 @@ my_priority_queue *build_tree(std::size_t const *const cnt) { new_node->right = top_node2; phtree->push(new_node); } -#endif return phtree; } @@ -126,9 +153,15 @@ void free_tree(my_priority_queue *const phtree) { } } -// Note: this function will change the quantized data. -template -void initialize_frequency_table(HuffmanCodec &codec, +//! Populate the frequency table of a `HuffmanCodec`. +//! +//!\note This function will change the quantized data. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! 
buffer will be changed by the codec-building process. +//\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +void initialize_frequency_table(HuffmanCodec &codec, long int *const quantized_data, const std::size_t n) { assert(*std::max_element(codec.frequency_table.begin(), @@ -137,15 +170,21 @@ void initialize_frequency_table(HuffmanCodec &codec, for (std::size_t i = 0; i < n; i++) { // Convert quantization level to positive so that counting freq can be // easily done. Level 0 is reserved a out-of-range flag. - quantized_data[i] = quantized_data[i] + NQL / 2; + quantized_data[i] = quantized_data[i] + nql / 2; ++codec.frequency_table[quantized_data[i] > 0 && quantized_data[i] < - static_cast(NQL) + static_cast(nql) ? quantized_data[i] : 0]; } } +//! Build a Huffman codec for an input buffer. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! buffer will be changed by the codec-building process. +//\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. 
template HuffmanCodec build_huffman_codec(long int *const quantized_data, const std::size_t n) { @@ -186,13 +225,12 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, } out_data_hit = reinterpret_cast(p_hit); - out_data_miss = (unsigned char *)p_miss; + out_data_miss = reinterpret_cast(p_miss); out_data_hit_size = 0; out_data_miss_size = 0; std::size_t start_bit = 0; unsigned int *cur = p_hit; - std::size_t cnt_missed = 0; for (std::size_t i = 0; i < n; i++) { const int q = quantized_data[i]; unsigned int code; @@ -209,7 +247,6 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, *p_miss = q; p_miss++; - cnt_missed++; } // Note that if len == 0, then that means that either the data is all the diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1e67174fac..427b2e4546 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -20,6 +20,7 @@ set( "src/test_quantize.cpp" "src/test_compressors.cpp" "src/test_CompressedDataset.cpp" + "src/test_huffman.cpp" ) if(MGARD_ENABLE_UNSTRUCTURED AND MOAB_FOUND) diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp new file mode 100644 index 0000000000..dcae0e9d03 --- /dev/null +++ b/tests/src/test_huffman.cpp @@ -0,0 +1,122 @@ +#include "catch2/catch_test_macros.hpp" + +#include + +#include +#include + +#include "testing_utilities.hpp" + +#include "huffman.hpp" + +namespace { + +void test_encoding_regression(long int *const quantized, const std::size_t N) { + long int *const quantized_new = new long int[N]; + std::copy(quantized, quantized + N, quantized_new); + + unsigned char *hit; + unsigned char *missed; + unsigned char *frequencies; + std::size_t bits_hit; + std::size_t bytes_missed; + std::size_t bytes_frequencies; + mgard::huffman_encoding(quantized, N, hit, bits_hit, missed, bytes_missed, + frequencies, bytes_frequencies); + + unsigned char *hit_new; + unsigned char *missed_new; + unsigned char *frequencies_new; + std::size_t bits_hit_new; 
+ std::size_t bytes_missed_new; + std::size_t bytes_frequencies_new; + mgard::huffman_encoding(quantized_new, N, hit_new, bits_hit_new, missed_new, + bytes_missed_new, frequencies_new, + bytes_frequencies_new); + + REQUIRE(bits_hit_new == bits_hit); + const std::size_t bytes_hit = (bits_hit + CHAR_BIT - 1) / CHAR_BIT; + REQUIRE(std::equal(hit, hit + bytes_hit, hit_new)); + + REQUIRE(bytes_missed_new == bytes_missed); + REQUIRE(std::equal(missed, missed + bytes_missed, missed_new)); + + REQUIRE(bytes_frequencies_new == bytes_frequencies); + REQUIRE(std::equal(frequencies, frequencies + bytes_frequencies, + frequencies_new)); + + delete[] quantized_new; +} + +void test_encoding_regression_constant(const std::size_t N, const long int q) { + long int *const quantized = new long int[N]; + std::fill(quantized, quantized + N, q); + test_encoding_regression(quantized, N); + delete[] quantized; +} + +//! Function object to generate periodic data. +struct PeriodicGenerator { + //! Constructor. + //! + //!\param value Starting value. + //!\param period Generator period. + PeriodicGenerator(const std::size_t period, const long int value) + : period(period), value(value), ncalls(0) {} + + //! Generator period. + std::size_t period; + + //! Starting value. + long int value; + + //! Number of times `operator()` has been called. 
+ std::size_t ncalls; + + long int operator()() { + return value + static_cast(ncalls++ % period); + } +}; + +void test_encoding_regression_periodic(const std::size_t N, const long int q, + const std::size_t period) { + long int *const quantized = new long int[N]; + std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); + test_encoding_regression(quantized, N); + delete[] quantized; +} + +void test_encoding_regression_random(const std::size_t N, const long int a, + const long int b, + std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + long int *const quantized = new long int[N]; + std::generate(quantized, quantized + N, [&] { return dis(gen); }); + test_encoding_regression(quantized, N); + delete[] quantized; +} + +} // namespace + +TEST_CASE("encoding regression", "[huffman]") { + SECTION("constant data") { + test_encoding_regression_constant(10, 0); + test_encoding_regression_constant(100, 732); + test_encoding_regression_constant(1000, -10); + } + + SECTION("periodic data") { + test_encoding_regression_periodic(10, -3, 3); + test_encoding_regression_periodic(100, 0, 10); + test_encoding_regression_periodic(1000, 51, 17); + } + + SECTION("random data") { + std::default_random_engine gen(131051); + test_encoding_regression_random(10, 0, 1, gen); + test_encoding_regression_random(100, -15, -5, gen); + test_encoding_regression_random(1000, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_encoding_regression_random(10000, -100, 100, gen); + } +} From d51312735cdc451a688728b752e98cf20f4cf6db Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 31 May 2022 13:07:31 -0400 Subject: [PATCH 14/58] Reimplement Huffman encoding with `HuffmanCode`. 
--- include/huffman.hpp | 145 +++++++++++++++++++++++++++++-- include/huffman.tpp | 124 ++++++++++++++++++++++++++ src/huffman.cpp | 173 ++++++++++++++++++++++++++++++++++++- tests/src/test_huffman.cpp | 6 +- 4 files changed, 438 insertions(+), 10 deletions(-) create mode 100644 include/huffman.tpp diff --git a/include/huffman.hpp b/include/huffman.hpp index 0ee06103d2..ed50c8b0c7 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -5,31 +5,164 @@ #include +#include +#include + namespace mgard { //! Encode quantized coefficients using a Huffman code. //! //!\param[in, out] quantized_data Input buffer (quantized coefficients). This //! buffer will be changed by the encoding process. -//\param[in] n Number of symbols (`long int` quantized coefficients) in the +//!\param[in] n Number of symbols (`long int` quantized coefficients) in the //! input buffer. //!\param[out] out_data_hit Pointer to compressed buffer. -//!\param[out] Size *in bits* of compressed buffer. -//!\param[out] Pointer to 'missed' buffer (input symbols not assigned codes). -//!\param[out] Size *in bytes* of 'missed' buffer. -//!\param[out] Frequency table for input buffer. -//!\param[out] Size *in bytes* of the frequency table. +//!\param[out] out_data_hit_size Size *in bits* of compressed buffer. +//!\param[out] out_data_miss Pointer to 'missed' buffer (input symbols not +//! assigned codes). +//!\param[out] out_data_miss_size Size *in bytes* of 'missed' +//! buffer. +//!\param[out] out_tree Frequency table for input buffer. +//!\param[out] out_tree_size Size *in bytes* of the frequency table. void huffman_encoding(long int *const quantized_data, const std::size_t n, unsigned char *&out_data_hit, size_t &out_data_hit_size, unsigned char *&out_data_miss, size_t &out_data_miss_size, unsigned char *&out_tree, size_t &out_tree_size); +//! Encode quantized coefficients using a Huffman code. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! 
buffer will be changed by the encoding process. +//!\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +//!\param[out] out_data_hit Pointer to compressed buffer. +//!\param[out] out_data_hit_size Size *in bits* of compressed buffer. +//!\param[out] out_data_miss Pointer to 'missed' buffer (input symbols not +//! assigned codes). +//!\param[out] out_data_miss_size Size *in bytes* of 'missed' +//! buffer. +//!\param[out] out_tree Frequency table for input buffer. +//!\param[out] out_tree_size Size *in bytes* of the frequency table. +void huffman_encoding_rewritten( + long int const *const quantized_data, const std::size_t n, + unsigned char *&out_data_hit, std::size_t &out_data_hit_size, + unsigned char *&out_data_miss, std::size_t &out_data_miss_size, + unsigned char *&out_tree, std::size_t &out_tree_size); + +//! Decode a stream encoded using a Huffman code. +//! +//!\param[out] quantized_data Output buffer (quantized coefficients). +//!\param[in] quantized_data_size Size *in bytes* of output buffer. +//!\param[in] out_data_hit Compressed buffer. +//!\param[in] out_data_hit_size Size *in bits* of compressed buffer. +//!\param[in] out_data_miss 'Missed' buffer (input symbols not assigned codes). +//!\param[in] out_data_miss_size Size *in bytes* of 'missed' buffer. +//!\param[in] out_tree Frequency table for input buffer. +//!\param[in] out_tree_size Size *in bytes* of the frequency table. void huffman_decoding( long int *const quantized_data, const std::size_t quantized_data_size, unsigned char const *const out_data_hit, const size_t out_data_hit_size, unsigned char const *const out_data_miss, const size_t out_data_miss_size, unsigned char const *const out_tree, const size_t out_tree_size); +//! Codeword (in progress) associated to a node in a Huffman code creation tree. +struct HuffmanCodeword { + //! Bytes containing the bits of the codeword. + std::vector bytes = {}; + + //! Length in bits of the codeword. 
+ std::size_t length = 0; + + //! Append a bit to the codeword. + void push_back(const bool bit); + + //! Generate the codeword associated to the left child in the tree. + HuffmanCodeword left() const; + + //! Generate the codeword associated to the right child in the tree. + HuffmanCodeword right() const; +}; + +//! Node in a Huffman code creation tree. +struct CodeCreationTreeNode { + //! Constructor. + //! + //! Create a leaf node. + //! + //!\param codeword Associated codeword. + //!\param count Frequency of the associated symbol. + CodeCreationTreeNode(HuffmanCodeword *const codeword, + const std::size_t count); + + //! Constructor. + //! + //! Create an inner (parent) node. + //! + //!\param left Left child of the node to be created. + //!\param right Right child of the node to be created. + CodeCreationTreeNode(const std::shared_ptr &left, + const std::shared_ptr &right); + + //! Associated codeword (if this node is a leaf). + HuffmanCodeword *codeword = nullptr; + + //! Sum of frequencies of symbols associated to leaves descending from this + //! node. + std::size_t count; + + //! Left child of this node. + std::shared_ptr left; + + //! Right child of this node. + std::shared_ptr right; +}; + +//! Huffman code generated from/for an input stream. +template class HuffmanCode { +public: + //! Constructor. + //! + //!\param ncodewords Number of symbols that will be assigned codewords. + //!\param begin Beginning of input stream. + //!\param end End of input stream. + HuffmanCode(const std::size_t ncodewords, Symbol const *const begin, + Symbol const *const end); + + //! Number of symbols that will be assigned codewords. + std::size_t ncodewords; + + //! Frequencies of the symbols in the input stream. + std::vector frequencies; + + //! Codewords associated to the symbols. + std::vector codewords; + + //! Report the number of out-of-range symbols encountered in the stream. + std::size_t nmissed() const; + + //! Check whether a symbol is out of range (ineligible for a codeword). 
+ bool out_of_range(const Symbol symbol) const; + + //! Determine the codeword index for a symbol. + std::size_t index(const Symbol symbol) const; + +private: + //! Smallest symbol (inclusive) to receive a codeword. + Symbol min_symbol; + + //! Largest symbol (inclusive) to receive a codeword. + Symbol max_symbol; + + // TODO: Check that frequency count ties aren't going to hurt us here. Stable + // sorting algorithm in `priority_queue`? + + //! Set codewords for given node and descendants. + void + recursively_set_codewords(const std::shared_ptr &node, + const HuffmanCodeword codeword); +}; + } // namespace mgard +#include "huffman.tpp" #endif diff --git a/include/huffman.tpp b/include/huffman.tpp new file mode 100644 index 0000000000..da11ac5d97 --- /dev/null +++ b/include/huffman.tpp @@ -0,0 +1,124 @@ +#include "utilities.hpp" + +#include + +#include +#include +#include +#include + +namespace mgard { + +//! This is used in the instantiation of `std::priority_queue`. +template struct HeldCountGreater { + bool operator()(const T &a, const T &b) const { return a->count > b->count; } +}; + +template +HuffmanCode::HuffmanCode(const std::size_t ncodewords, + Symbol const *const begin, + Symbol const *const end) + : ncodewords(ncodewords), frequencies(ncodewords), codewords(ncodewords) { + static_assert(std::is_integral::value and + std::is_signed::value, + "symbol type must be signed and integral"); + // Haven't carefully checked what the minimum acceptable value is. + if (not ncodewords) { + throw std::invalid_argument("`ncodewords` must be positive."); + } + { + const Symbol SYMBOL_MAX = std::numeric_limits::max(); + const Symbol SYMBOL_MIN = std::numeric_limits::min(); + + const std::size_t max_symbol_ = (ncodewords + 1) / 2 - 1; + const std::size_t opp_min_symbol_ = ncodewords / 2; + + // TODO: There is surely a better way of doing this. Lots of potential + // issues with directly comparing `opp_min_symbol_` and `-SYMBOL_MIN`. 
+ // `-SYMBOL_MIN` can't necessarily be represented as a `Symbol`, for + // example. Trying to avoid overflows. + std::size_t a = opp_min_symbol_; + Symbol b = SYMBOL_MIN; + while (a) { + a /= 2; + b /= 2; + } + if (not b) { + // Only a "risk" because we haven't actually established that + // `opp_min_symbol_` is greater in magnitude than `SYMBOL_MIN`. + throw std::overflow_error( + "risk that minimum symbol cannot be represented in symbol type"); + } else if (opp_min_symbol_ > SYMBOL_MAX) { + throw std::overflow_error( + "opposite of minimum symbol canont be represented in symbol type"); + } else { + min_symbol = -static_cast(opp_min_symbol_); + } + + // `opp_min_symbol_` is either equal to or one greater than `max_symbol_`, + // and we checked above that `opp_min_symbol <= SYMBOL_MAX`. So, we know + // that `max_symbol_ <= SYMBOL_MAX` here. + max_symbol = max_symbol_; + } + for (const Symbol symbol : + RangeSlice{.begin_ = begin, .end_ = end}) { + ++frequencies.at(index(symbol)); + } + + using T = std::shared_ptr; + std::priority_queue, HeldCountGreater> queue; + + // We can't quite use a `ZippedRange` here, I think, because + // `ZippedRange::iterator` doesn't expose the underlying iterators and + // we want a pointer to the codeword. 
+ typename std::vector::const_iterator p = frequencies.cbegin(); + HuffmanCodeword *q = codewords.data(); + for (std::size_t i = 0; i < ncodewords; ++i) { + const std::size_t count = *p; + if (count) { + queue.push(std::make_shared(q, count)); + } + ++p; + ++q; + } + while (queue.size() > 1) { + const std::shared_ptr a = queue.top(); + queue.pop(); + const std::shared_ptr b = queue.top(); + queue.pop(); + + queue.push(std::make_shared(a, b)); + } + + recursively_set_codewords(queue.top(), {}); +} + +template std::size_t HuffmanCode::nmissed() const { + return frequencies.at(0); +} + +template +bool HuffmanCode::out_of_range(const Symbol symbol) const { + return symbol < min_symbol or symbol > max_symbol; +} + +template +std::size_t HuffmanCode::index(const Symbol symbol) const { + return out_of_range(symbol) ? 0 : 1 + symbol - min_symbol; +} + +template +void HuffmanCode::recursively_set_codewords( + const std::shared_ptr &node, + const HuffmanCodeword codeword) { + const bool children = node->left; + assert(children == static_cast(node->right)); + if (children) { + recursively_set_codewords(node->left, codeword.left()); + recursively_set_codewords(node->right, codeword.right()); + } else { + *node->codeword = codeword; + } +} + +} // namespace mgard diff --git a/src/huffman.cpp b/src/huffman.cpp index 3f0ccd18e2..af45c78da8 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -6,15 +7,78 @@ #include #include +#include #include #include #include "huffman.hpp" +#include "utilities.hpp" + namespace mgard { const int nql = 32768 * 4; +struct HuffmanEncodedStream { + //! Constructor. + //! + //!\param nbits Length in bits of the compressed stream. + //!\param ncompressed Length in bits of the compressed stream. + //!\param nmissed Length in bytes of the missed array. + //!\param ntable Length in bytes of the frequency table. 
+ HuffmanEncodedStream(const std::size_t nbits, const std::size_t ncompressed, + const std::size_t nmissed, const std::size_t ntable); + + //! Length in bits of the compressed stream. + std::size_t nbits; + + //! Compressed stream. + MemoryBuffer hit; + + //! Missed array. + MemoryBuffer missed; + + //! Frequency table. + MemoryBuffer frequencies; +}; + +HuffmanEncodedStream::HuffmanEncodedStream(const std::size_t nbits, + const std::size_t ncompressed, + const std::size_t nmissed, + const std::size_t nfrequencies) + : nbits(nbits), hit(ncompressed), missed(nmissed), + frequencies(nfrequencies) {} + +void HuffmanCodeword::push_back(const bool bit) { + const unsigned char offset = length % CHAR_BIT; + if (not offset) { + bytes.push_back(0); + } + bytes.back() |= static_cast(bit) << (CHAR_BIT - 1 - offset); + ++length; +} + +HuffmanCodeword HuffmanCodeword::left() const { + HuffmanCodeword tmp = *this; + tmp.push_back(false); + return tmp; +} + +HuffmanCodeword HuffmanCodeword::right() const { + HuffmanCodeword tmp = *this; + tmp.push_back(true); + return tmp; +} + +CodeCreationTreeNode::CodeCreationTreeNode(HuffmanCodeword *const codeword, + const std::size_t count) + : codeword(codeword), count(count) {} + +CodeCreationTreeNode::CodeCreationTreeNode( + const std::shared_ptr &left, + const std::shared_ptr &right) + : count(left->count + right->count), left(left), right(right) {} + //! Node in the Huffman code creation tree. struct htree_node { //! Constructor. 
@@ -165,7 +229,7 @@ void initialize_frequency_table(HuffmanCodec &codec, long int *const quantized_data, const std::size_t n) { assert(*std::max_element(codec.frequency_table.begin(), - code.frequency_table.end()) == 0); + codec.frequency_table.end()) == 0); for (std::size_t i = 0; i < n; i++) { // Convert quantization level to positive so that counting freq can be @@ -299,6 +363,113 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, out_tree_size = 2 * nonZeros * sizeof(std::size_t); } +void huffman_encoding_rewritten( + long int const *const quantized_data, const std::size_t n, + unsigned char *&out_data_hit, std::size_t &out_data_hit_size, + unsigned char *&out_data_miss, std::size_t &out_data_miss_size, + unsigned char *&out_tree, std::size_t &out_tree_size) { + const std::size_t ncodewords = nql - 1; + const HuffmanCode code(ncodewords, quantized_data, + quantized_data + n); + + std::vector lengths; + for (const HuffmanCodeword &codeword : code.codewords) { + lengths.push_back(codeword.length); + } + const std::size_t nbits = + std::inner_product(code.frequencies.begin(), code.frequencies.end(), + lengths.begin(), static_cast(0)); + const std::size_t nbytes = + sizeof(unsigned int) * ((nbits + CHAR_BIT * sizeof(unsigned int) - 1) / + (CHAR_BIT * sizeof(unsigned int))); + if (nbytes % sizeof(unsigned int)) { + throw std::runtime_error( + "`nbytes` not bumped up to nearest multiple of `unsigned int` size"); + } + + const std::size_t nnz = ncodewords - std::count(code.frequencies.begin(), + code.frequencies.end(), 0); + + HuffmanEncodedStream out(nbits, nbytes, code.nmissed() * sizeof(int), + 2 * nnz * sizeof(std::size_t)); + + // Write frequency table. 
+ { + std::size_t *p = + reinterpret_cast(out.frequencies.data.get()); + const std::vector &frequencies = code.frequencies; + for (std::size_t i = 0; i < ncodewords; ++i) { + const std::size_t frequency = frequencies.at(i); + if (frequency) { + *p++ = i; + *p++ = frequency; + } + } + } + + unsigned char *const buffer = out.hit.data.get(); + { + unsigned char *const p = out.hit.data.get(); + std::fill(p, p + out.hit.size, 0); + } + unsigned char *hit = buffer; + + int *missed = reinterpret_cast(out.missed.data.get()); + + unsigned char offset = 0; + for (const long int q : PseudoArray(quantized_data, n)) { + if (code.out_of_range(q)) { + // Remember that `missed` is an `int` rather than a `long int`. + *missed++ = q + nql / 2; + } + + const HuffmanCodeword codeword = code.codewords.at(code.index(q)); + std::size_t NREMAINING = codeword.length; + for (unsigned char byte : codeword.bytes) { + // Number of bits of `byte` left to write. + unsigned char nremaining = + std::min(static_cast(CHAR_BIT), NREMAINING); + // Premature, but this will hold when we're done with `byte`. + NREMAINING -= nremaining; + + while (nremaining) { + *hit |= byte >> offset; + // Number of bits of `byte` just written (not cumulative). 
+ const unsigned char nwritten = std::min( + nremaining, static_cast( + static_cast(CHAR_BIT) - offset)); + offset += nwritten; + hit += offset / CHAR_BIT; + offset %= CHAR_BIT; + nremaining -= nwritten; + byte <<= nwritten; + } + } + } + + { + const unsigned int one{1}; + const bool little_endian = *reinterpret_cast(&one); + if (little_endian) { + for (std::size_t i = 0; i < nbytes; i += sizeof(unsigned int)) { + unsigned char *a = buffer + i; + unsigned char *b = a + sizeof(unsigned int) - 1; + for (std::size_t j = 0; j < sizeof(unsigned int) / 2; ++j) { + std::swap(*a++, *b--); + } + } + } + } + + out_data_hit_size = out.nbits; + out_data_miss_size = out.missed.size; + out_tree_size = out.frequencies.size; + + out_data_hit = out.hit.data.release(); + out_data_miss = out.missed.data.release(); + out_tree = out.frequencies.data.release(); +} + void huffman_decoding(long int *const quantized_data, const std::size_t quantized_data_size, unsigned char const *const out_data_hit, diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index dcae0e9d03..4efecf5474 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -30,9 +30,9 @@ void test_encoding_regression(long int *const quantized, const std::size_t N) { std::size_t bits_hit_new; std::size_t bytes_missed_new; std::size_t bytes_frequencies_new; - mgard::huffman_encoding(quantized_new, N, hit_new, bits_hit_new, missed_new, - bytes_missed_new, frequencies_new, - bytes_frequencies_new); + mgard::huffman_encoding_rewritten(quantized_new, N, hit_new, bits_hit_new, + missed_new, bytes_missed_new, + frequencies_new, bytes_frequencies_new); REQUIRE(bits_hit_new == bits_hit); const std::size_t bytes_hit = (bits_hit + CHAR_BIT - 1) / CHAR_BIT; From 44cdc1271f2fa4645da3d8ec25b9403989ea77a1 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 31 May 2022 15:53:33 -0400 Subject: [PATCH 15/58] Return struct from rewritten Huffman encoder. 
--- include/huffman.hpp | 45 +++++++++++++++++++++++++------------- src/huffman.cpp | 41 ++++------------------------------ tests/src/test_huffman.cpp | 25 ++++++++------------- 3 files changed, 43 insertions(+), 68 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index ed50c8b0c7..564272d7f0 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -8,8 +8,34 @@ #include #include +#include "utilities.hpp" + namespace mgard { +//! A stream compressed using a Huffman code. +struct HuffmanEncodedStream { + //! Constructor. + //! + //!\param nbits Length in bits of the compressed stream. + //!\param ncompressed Length in bytes of the compressed stream. + //!\param nmissed Length in bytes of the missed array. + //!\param ntable Length in bytes of the frequency table. + HuffmanEncodedStream(const std::size_t nbits, const std::size_t ncompressed, + const std::size_t nmissed, const std::size_t ntable); + + //! Length in bits of the compressed stream. + std::size_t nbits; + + //! Compressed stream. + MemoryBuffer hit; + + //! Missed array. + MemoryBuffer missed; + + //! Frequency table. + MemoryBuffer frequencies; +}; + //! Encode quantized coefficients using a Huffman code. //! //!\param[in, out] quantized_data Input buffer (quantized coefficients). This @@ -31,23 +57,12 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, //! Encode quantized coefficients using a Huffman code. //! -//!\param[in, out] quantized_data Input buffer (quantized coefficients). This -//! buffer will be changed by the encoding process. +//!\param[in] quantized_data Input buffer (quantized coefficients). //!\param[in] n Number of symbols (`long int` quantized coefficients) in the //! input buffer. -//!\param[out] out_data_hit Pointer to compressed buffer. -//!\param[out] out_data_hit_size Size *in bits* of compressed buffer. -//!\param[out] out_data_miss Pointer to 'missed' buffer (input symbols not -//! assigned codes). 
-//!\param[out] out_data_miss_size Size *in bytes* of 'missed' -//! buffer. -//!\param[out] out_tree Frequency table for input buffer. -//!\param[out] out_tree_size Size *in bytes* of the frequency table. -void huffman_encoding_rewritten( - long int const *const quantized_data, const std::size_t n, - unsigned char *&out_data_hit, std::size_t &out_data_hit_size, - unsigned char *&out_data_miss, std::size_t &out_data_miss_size, - unsigned char *&out_tree, std::size_t &out_tree_size); +HuffmanEncodedStream +huffman_encoding_rewritten(long int const *const quantized_data, + const std::size_t n); //! Decode a stream encoded using a Huffman code. //! diff --git a/src/huffman.cpp b/src/huffman.cpp index af45c78da8..1983a863d5 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -13,35 +13,10 @@ #include "huffman.hpp" -#include "utilities.hpp" - namespace mgard { const int nql = 32768 * 4; -struct HuffmanEncodedStream { - //! Constructor. - //! - //!\param nbits Length in bits of the compressed stream. - //!\param ncompressed Length in bits of the compressed stream. - //!\param nmissed Length in bytes of the missed array. - //!\param ntable Length in bytes of the frequency table. - HuffmanEncodedStream(const std::size_t nbits, const std::size_t ncompressed, - const std::size_t nmissed, const std::size_t ntable); - - //! Length in bits of the compressed stream. - std::size_t nbits; - - //! Compressed stream. - MemoryBuffer hit; - - //! Missed array. - MemoryBuffer missed; - - //! Frequency table. 
- MemoryBuffer frequencies; -}; - HuffmanEncodedStream::HuffmanEncodedStream(const std::size_t nbits, const std::size_t ncompressed, const std::size_t nmissed, @@ -363,11 +338,9 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, out_tree_size = 2 * nonZeros * sizeof(std::size_t); } -void huffman_encoding_rewritten( - long int const *const quantized_data, const std::size_t n, - unsigned char *&out_data_hit, std::size_t &out_data_hit_size, - unsigned char *&out_data_miss, std::size_t &out_data_miss_size, - unsigned char *&out_tree, std::size_t &out_tree_size) { +HuffmanEncodedStream +huffman_encoding_rewritten(long int const *const quantized_data, + const std::size_t n) { const std::size_t ncodewords = nql - 1; const HuffmanCode code(ncodewords, quantized_data, quantized_data + n); @@ -461,13 +434,7 @@ void huffman_encoding_rewritten( } } - out_data_hit_size = out.nbits; - out_data_miss_size = out.missed.size; - out_tree_size = out.frequencies.size; - - out_data_hit = out.hit.data.release(); - out_data_miss = out.missed.data.release(); - out_tree = out.frequencies.data.release(); + return out; } void huffman_decoding(long int *const quantized_data, diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index 4efecf5474..55fa93c095 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -24,26 +24,19 @@ void test_encoding_regression(long int *const quantized, const std::size_t N) { mgard::huffman_encoding(quantized, N, hit, bits_hit, missed, bytes_missed, frequencies, bytes_frequencies); - unsigned char *hit_new; - unsigned char *missed_new; - unsigned char *frequencies_new; - std::size_t bits_hit_new; - std::size_t bytes_missed_new; - std::size_t bytes_frequencies_new; - mgard::huffman_encoding_rewritten(quantized_new, N, hit_new, bits_hit_new, - missed_new, bytes_missed_new, - frequencies_new, bytes_frequencies_new); - - REQUIRE(bits_hit_new == bits_hit); + const mgard::HuffmanEncodedStream out_new = + 
mgard::huffman_encoding_rewritten(quantized_new, N); + + REQUIRE(out_new.nbits == bits_hit); const std::size_t bytes_hit = (bits_hit + CHAR_BIT - 1) / CHAR_BIT; - REQUIRE(std::equal(hit, hit + bytes_hit, hit_new)); + REQUIRE(std::equal(hit, hit + bytes_hit, out_new.hit.data.get())); - REQUIRE(bytes_missed_new == bytes_missed); - REQUIRE(std::equal(missed, missed + bytes_missed, missed_new)); + REQUIRE(out_new.missed.size == bytes_missed); + REQUIRE(std::equal(missed, missed + bytes_missed, out_new.missed.data.get())); - REQUIRE(bytes_frequencies_new == bytes_frequencies); + REQUIRE(out_new.frequencies.size == bytes_frequencies); REQUIRE(std::equal(frequencies, frequencies + bytes_frequencies, - frequencies_new)); + out_new.frequencies.data.get())); delete[] quantized_new; } From 38a4d96aae0f45db94307f2e5104d4ee462f6056 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 1 Jun 2022 11:10:37 -0400 Subject: [PATCH 16/58] Return struct from original Huffman encoder. --- include/huffman.hpp | 14 ++------------ src/compressors.cpp | 22 +++++++++++----------- src/huffman.cpp | 34 ++++++++++++++++++++++------------ tests/src/test_huffman.cpp | 29 +++++++++++++---------------- 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 564272d7f0..58a47d02b5 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -42,18 +42,8 @@ struct HuffmanEncodedStream { //! buffer will be changed by the encoding process. //!\param[in] n Number of symbols (`long int` quantized coefficients) in the //! input buffer. -//!\param[out] out_data_hit Pointer to compressed buffer. -//!\param[out] out_data_hit_size Size *in bits* of compressed buffer. -//!\param[out] out_data_miss Pointer to 'missed' buffer (input symbols not -//! assigned codes). -//!\param[out] out_data_miss_size Size *in bytes* of 'missed' -//! buffer. -//!\param[out] out_tree Frequency table for input buffer. 
-//!\param[out] out_tree_size Size *in bytes* of the frequency table. -void huffman_encoding(long int *const quantized_data, const std::size_t n, - unsigned char *&out_data_hit, size_t &out_data_hit_size, - unsigned char *&out_data_miss, size_t &out_data_miss_size, - unsigned char *&out_tree, size_t &out_tree_size); +HuffmanEncodedStream huffman_encoding(long int *const quantized_data, + const std::size_t n); //! Encode quantized coefficients using a Huffman code. //! diff --git a/src/compressors.cpp b/src/compressors.cpp index 05852b1ef3..2fc2c0147f 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -70,17 +70,17 @@ void decompress_memory_huffman(unsigned char *const src, MemoryBuffer compress_memory_huffman(long int *const src, const std::size_t srcLen) { - unsigned char *out_data_hit = 0; - size_t out_data_hit_size; - unsigned char *out_data_miss = 0; - size_t out_data_miss_size; - unsigned char *out_tree = 0; - size_t out_tree_size; #ifdef MGARD_TIMING auto huff_time1 = std::chrono::high_resolution_clock::now(); #endif - huffman_encoding(src, srcLen, out_data_hit, out_data_hit_size, out_data_miss, - out_data_miss_size, out_tree, out_tree_size); + HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); + const std::size_t out_data_hit_size = encoded.nbits; + const std::size_t out_data_miss_size = encoded.missed.size; + const std::size_t out_tree_size = encoded.frequencies.size; + unsigned char const *const out_data_hit = encoded.hit.data.release(); + unsigned char const *const out_data_miss = encoded.missed.data.release(); + unsigned char const *const out_tree = encoded.frequencies.data.release(); + #ifdef MGARD_TIMING auto huff_time2 = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast( @@ -106,9 +106,9 @@ MemoryBuffer compress_memory_huffman(long int *const src, bufp += out_data_miss_size; } - free(out_tree); - free(out_data_hit); - free(out_data_miss); + delete[] out_data_hit; + delete[] out_data_miss; + 
delete[] out_tree; #ifndef MGARD_ZSTD #ifdef MGARD_TIMING diff --git a/src/huffman.cpp b/src/huffman.cpp index 1983a863d5..3c5fbb5a77 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -240,12 +240,8 @@ HuffmanCodec build_huffman_codec(long int *const quantized_data, return codec; } -void huffman_encoding(long int *const quantized_data, const std::size_t n, - unsigned char *&out_data_hit, - std::size_t &out_data_hit_size, - unsigned char *&out_data_miss, - std::size_t &out_data_miss_size, unsigned char *&out_tree, - std::size_t &out_tree_size) { +HuffmanEncodedStream huffman_encoding(long int *const quantized_data, + const std::size_t n) { const HuffmanCodec codec = build_huffman_codec(quantized_data, n); const std::size_t num_miss = codec.frequency_table[0]; @@ -263,10 +259,12 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, p_miss = new int[num_miss](); } - out_data_hit = reinterpret_cast(p_hit); - out_data_miss = reinterpret_cast(p_miss); - out_data_hit_size = 0; - out_data_miss_size = 0; + unsigned char const *const out_data_hit = + reinterpret_cast(p_hit); + unsigned char const *const out_data_miss = + reinterpret_cast(p_miss); + std::size_t out_data_hit_size = 0; + std::size_t out_data_miss_size = 0; std::size_t start_bit = 0; unsigned int *cur = p_hit; @@ -334,8 +332,20 @@ void huffman_encoding(long int *const quantized_data, const std::size_t n, } } - out_tree = (unsigned char *)cft; - out_tree_size = 2 * nonZeros * sizeof(std::size_t); + unsigned char const *const out_tree = (unsigned char *)cft; + const std::size_t out_tree_size = 2 * nonZeros * sizeof(std::size_t); + + const std::size_t nbytes = + sizeof(unsigned int) * + ((out_data_hit_size + CHAR_BIT * sizeof(unsigned int) - 1) / + (CHAR_BIT * sizeof(unsigned int))); + HuffmanEncodedStream out(out_data_hit_size, nbytes, out_data_miss_size, + out_tree_size); + std::copy(out_data_hit, out_data_hit + nbytes, out.hit.data.get()); + std::copy(out_data_miss, out_data_miss + 
out_data_miss_size, + out.missed.data.get()); + std::copy(out_tree, out_tree + out_tree_size, out.frequencies.data.get()); + return out; } HuffmanEncodedStream diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index 55fa93c095..2b31eb4c9e 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -15,27 +15,24 @@ void test_encoding_regression(long int *const quantized, const std::size_t N) { long int *const quantized_new = new long int[N]; std::copy(quantized, quantized + N, quantized_new); - unsigned char *hit; - unsigned char *missed; - unsigned char *frequencies; - std::size_t bits_hit; - std::size_t bytes_missed; - std::size_t bytes_frequencies; - mgard::huffman_encoding(quantized, N, hit, bits_hit, missed, bytes_missed, - frequencies, bytes_frequencies); - + const mgard::HuffmanEncodedStream out = mgard::huffman_encoding(quantized, N); const mgard::HuffmanEncodedStream out_new = mgard::huffman_encoding_rewritten(quantized_new, N); - REQUIRE(out_new.nbits == bits_hit); - const std::size_t bytes_hit = (bits_hit + CHAR_BIT - 1) / CHAR_BIT; - REQUIRE(std::equal(hit, hit + bytes_hit, out_new.hit.data.get())); + unsigned char const *const hit = out.hit.data.get(); + REQUIRE(out_new.nbits == out.nbits); + const std::size_t nbytes = (out.nbits + CHAR_BIT - 1) / CHAR_BIT; + REQUIRE(std::equal(hit, hit + nbytes, out_new.hit.data.get())); - REQUIRE(out_new.missed.size == bytes_missed); - REQUIRE(std::equal(missed, missed + bytes_missed, out_new.missed.data.get())); + unsigned char const *const missed = out.missed.data.get(); + const std::size_t nmissed = out.missed.size; + REQUIRE(out_new.missed.size == nmissed); + REQUIRE(std::equal(missed, missed + nmissed, out_new.missed.data.get())); - REQUIRE(out_new.frequencies.size == bytes_frequencies); - REQUIRE(std::equal(frequencies, frequencies + bytes_frequencies, + unsigned char const *const frequencies = out.frequencies.data.get(); + const std::size_t nfrequencies = out.frequencies.size; 
+ REQUIRE(out_new.frequencies.size == nfrequencies); + REQUIRE(std::equal(frequencies, frequencies + nfrequencies, out_new.frequencies.data.get())); delete[] quantized_new; From 83f31e0a2b1a7eba9a2596a72171906ea8ecfe37 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 1 Jun 2022 11:51:00 -0400 Subject: [PATCH 17/58] Avoid buffer copies in `huffman_encoding`. --- src/huffman.cpp | 101 ++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 63 deletions(-) diff --git a/src/huffman.cpp b/src/huffman.cpp index 3c5fbb5a77..937c5d9b6a 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -247,27 +247,40 @@ HuffmanEncodedStream huffman_encoding(long int *const quantized_data, assert(n >= num_miss); - /* For those miss points, we still need to maintain a flag (q = 0), - * and therefore we need to allocate space for n numbers. - */ - // The elements of the array are value-initialized (here, zero-initialized). - unsigned int *const p_hit = new unsigned int[n](); - - int *p_miss = nullptr; - if (num_miss > 0) { - // The elements of the array are value-initialized (here, zero-initialized). - p_miss = new int[num_miss](); + std::size_t nnz = 0; + std::size_t nbits = 0; + for (std::size_t i = 0; i < nql; ++i) { + const huffman_codec &codec_ = codec.codec.at(i); + const std::size_t frequency = codec.frequency_table.at(i); + nbits += frequency * codec_.len; + nnz += frequency ? 
1 : 0; } - unsigned char const *const out_data_hit = - reinterpret_cast(p_hit); - unsigned char const *const out_data_miss = - reinterpret_cast(p_miss); - std::size_t out_data_hit_size = 0; - std::size_t out_data_miss_size = 0; + const std::size_t nbytes = + sizeof(unsigned int) * ((nbits + CHAR_BIT * sizeof(unsigned int) - 1) / + (CHAR_BIT * sizeof(unsigned int))); + HuffmanEncodedStream out(nbits, nbytes, num_miss * sizeof(int), + 2 * nnz * sizeof(std::size_t)); + + unsigned int *const hit = + reinterpret_cast(out.hit.data.get()); + std::fill(hit, hit + nbytes / sizeof(unsigned int), 0u); + + int *missed = reinterpret_cast(out.missed.data.get()); + + // write frequency table to buffer + std::size_t *const cft = + reinterpret_cast(out.frequencies.data.get()); + std::size_t off = 0; + for (std::size_t i = 0; i < nql; ++i) { + if (codec.frequency_table[i] > 0) { + cft[2 * off] = i; + cft[2 * off + 1] = codec.frequency_table[i]; + off++; + } + } std::size_t start_bit = 0; - unsigned int *cur = p_hit; for (std::size_t i = 0; i < n; i++) { const int q = quantized_data[i]; unsigned int code; @@ -282,8 +295,7 @@ HuffmanEncodedStream huffman_encoding(long int *const quantized_data, code = codec.codec[0].code; len = codec.codec[0].len; - *p_miss = q; - p_miss++; + *missed++ = q; } // Note that if len == 0, then that means that either the data is all the @@ -297,54 +309,17 @@ HuffmanEncodedStream huffman_encoding(long int *const quantized_data, // and copy the rest len - (32 - start_bit % 32) to the next int const std::size_t rshift = len - (32 - start_bit % 32); const std::size_t lshift = 32 - rshift; - *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | (code >> rshift); - *(cur + start_bit / 32 + 1) = - (*(cur + start_bit / 32 + 1)) | (code << lshift); - start_bit += len; - } else if (len > 0) { + *(hit + start_bit / 32) = (*(hit + start_bit / 32)) | (code >> rshift); + *(hit + start_bit / 32 + 1) = + (*(hit + start_bit / 32 + 1)) | (code << lshift); + } else if (len) 
{ code = code << (32 - start_bit % 32 - len); - *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | code; - start_bit += len; - } else { - // Sequence is empty (everything must be the same). Do nothing. - } - } - - // Note: hit size is in bits, while miss size is in bytes. - out_data_hit_size = start_bit; - out_data_miss_size = num_miss * sizeof(int); - - // write frequency table to buffer - int nonZeros = 0; - for (int i = 0; i < nql; i++) { - if (codec.frequency_table[i] > 0) { - nonZeros++; - } - } - - std::size_t *const cft = new std::size_t[2 * nonZeros]; - int off = 0; - for (int i = 0; i < nql; i++) { - if (codec.frequency_table[i] > 0) { - cft[2 * off] = i; - cft[2 * off + 1] = codec.frequency_table[i]; - off++; + *(hit + start_bit / 32) = (*(hit + start_bit / 32)) | code; } + // No effect if `len == 0`. + start_bit += len; } - unsigned char const *const out_tree = (unsigned char *)cft; - const std::size_t out_tree_size = 2 * nonZeros * sizeof(std::size_t); - - const std::size_t nbytes = - sizeof(unsigned int) * - ((out_data_hit_size + CHAR_BIT * sizeof(unsigned int) - 1) / - (CHAR_BIT * sizeof(unsigned int))); - HuffmanEncodedStream out(out_data_hit_size, nbytes, out_data_miss_size, - out_tree_size); - std::copy(out_data_hit, out_data_hit + nbytes, out.hit.data.get()); - std::copy(out_data_miss, out_data_miss + out_data_miss_size, - out.missed.data.get()); - std::copy(out_tree, out_tree + out_tree_size, out.frequencies.data.get()); return out; } From c52d44e60b875b7db344b80017b8dbbe5d62ae82 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 2 Jun 2022 16:18:50 -0400 Subject: [PATCH 18/58] Separately copy hit buffer, trailing zero bytes. 
--- src/compressors.cpp | 70 ++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/src/compressors.cpp b/src/compressors.cpp index 2fc2c0147f..bf5bd7c57f 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -46,8 +46,8 @@ void decompress_memory_huffman(unsigned char *const src, out_data_miss_size = *(size_t *)buf; buf += sizeof(size_t); - size_t total_huffman_size = - out_tree_size + out_data_hit_size / 8 + 4 + out_data_miss_size; + size_t total_huffman_size = out_tree_size + out_data_hit_size / CHAR_BIT + + sizeof(unsigned int) + out_data_miss_size; unsigned char *huffman_encoding_p = (unsigned char *)malloc(total_huffman_size); #ifndef MGARD_ZSTD @@ -59,8 +59,8 @@ void decompress_memory_huffman(unsigned char *const src, #endif out_tree = huffman_encoding_p; out_data_hit = huffman_encoding_p + out_tree_size; - out_data_miss = - huffman_encoding_p + out_tree_size + out_data_hit_size / 8 + 4; + out_data_miss = huffman_encoding_p + out_tree_size + + out_data_hit_size / CHAR_BIT + sizeof(unsigned int); huffman_decoding(dst, dstLen, out_data_hit, out_data_hit_size, out_data_miss, out_data_miss_size, out_tree, out_tree_size); @@ -74,12 +74,8 @@ MemoryBuffer compress_memory_huffman(long int *const src, auto huff_time1 = std::chrono::high_resolution_clock::now(); #endif HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); - const std::size_t out_data_hit_size = encoded.nbits; - const std::size_t out_data_miss_size = encoded.missed.size; - const std::size_t out_tree_size = encoded.frequencies.size; - unsigned char const *const out_data_hit = encoded.hit.data.release(); - unsigned char const *const out_data_miss = encoded.missed.data.release(); - unsigned char const *const out_tree = encoded.frequencies.data.release(); + + assert(not(encoded.hit.size % sizeof(unsigned int))); #ifdef MGARD_TIMING auto huff_time2 = std::chrono::high_resolution_clock::now(); @@ -88,34 +84,44 @@ MemoryBuffer 
compress_memory_huffman(long int *const src, std::cout << "Huffman tree time = " << (double)duration.count() / 1000000 << "\n"; #endif - const size_t total_size = - out_data_hit_size / 8 + 4 + out_data_miss_size + out_tree_size; - unsigned char *payload = (unsigned char *)malloc(total_size); + static_assert(CHAR_BIT == 8, "code written assuming `CHAR_BIT == 8`"); + static_assert(sizeof(unsigned int) == 4, + "code written assuming `sizeof(unsigned int) == 4`"); + const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); + // Number of hit buffer padding bytes. + const std::size_t nhpb = offset ? offset / CHAR_BIT : sizeof(unsigned int); + + assert(encoded.hit.size + nhpb == + encoded.nbits / CHAR_BIT + sizeof(unsigned int)); + + const size_t npayload = + encoded.hit.size + nhpb + encoded.missed.size + encoded.frequencies.size; + unsigned char *const payload = new unsigned char[npayload]; unsigned char *bufp = payload; - if (out_tree_size) { - std::memcpy(bufp, out_tree, out_tree_size); - bufp += out_tree_size; - } + std::memcpy(bufp, encoded.frequencies.data.get(), encoded.frequencies.size); + bufp += encoded.frequencies.size; - std::memcpy(bufp, out_data_hit, out_data_hit_size / 8 + 4); - bufp += out_data_hit_size / 8 + 4; + std::memcpy(bufp, encoded.hit.data.get(), encoded.hit.size); + bufp += encoded.hit.size; - if (out_data_miss_size) { - std::memcpy(bufp, out_data_miss, out_data_miss_size); - bufp += out_data_miss_size; + { + const unsigned char zero{0}; + for (std::size_t i = 0; i < nhpb; ++i) { + std::memcpy(bufp, &zero, 1); + bufp += 1; + } } - delete[] out_data_hit; - delete[] out_data_miss; - delete[] out_tree; + std::memcpy(bufp, encoded.missed.data.get(), encoded.missed.size); + bufp += encoded.missed.size; #ifndef MGARD_ZSTD #ifdef MGARD_TIMING auto z_time1 = std::chrono::high_resolution_clock::now(); #endif const MemoryBuffer out_data = - compress_memory_z(payload, total_size); + compress_memory_z(payload, npayload); #ifdef 
MGARD_TIMING auto z_time2 = std::chrono::high_resolution_clock::now(); auto z_duration = @@ -128,7 +134,7 @@ MemoryBuffer compress_memory_huffman(long int *const src, auto zstd_time1 = std::chrono::high_resolution_clock::now(); #endif const MemoryBuffer out_data = - compress_memory_zstd(payload, total_size); + compress_memory_zstd(payload, npayload); #ifdef MGARD_TIMING auto zstd_time2 = std::chrono::high_resolution_clock::now(); auto zstd_duration = std::chrono::duration_cast( @@ -137,20 +143,20 @@ MemoryBuffer compress_memory_huffman(long int *const src, << (double)zstd_duration.count() / 1000000 << "\n"; #endif #endif - free(payload); - payload = 0; + delete[] payload; + bufp = nullptr; const std::size_t bufferLen = 3 * sizeof(size_t) + out_data.size; unsigned char *const buffer = new unsigned char[bufferLen]; bufp = buffer; - *(size_t *)bufp = out_tree_size; + *(size_t *)bufp = encoded.frequencies.size; bufp += sizeof(size_t); - *(size_t *)bufp = out_data_hit_size; + *(size_t *)bufp = encoded.nbits; bufp += sizeof(size_t); - *(size_t *)bufp = out_data_miss_size; + *(size_t *)bufp = encoded.missed.size; bufp += sizeof(size_t); { From 7a757a628e15f9b9494ce696db3888ff027a23bf Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 2 Jun 2022 13:19:22 -0400 Subject: [PATCH 19/58] Add Huffman compression regression tests. --- tests/include/testing_utilities.hpp | 21 +++++++++ tests/src/test_compressors.cpp | 67 +++++++++++++++++++++++++++++ tests/src/test_huffman.cpp | 23 ---------- tests/src/testing_utilities.cpp | 8 ++++ 4 files changed, 96 insertions(+), 23 deletions(-) diff --git a/tests/include/testing_utilities.hpp b/tests/include/testing_utilities.hpp index 318d521d6d..4b8343d783 100644 --- a/tests/include/testing_utilities.hpp +++ b/tests/include/testing_utilities.hpp @@ -61,5 +61,26 @@ mgard::TensorMeshHierarchy make_flat_hierarchy(const mgard::TensorMeshHierarchy &hierarchy, const std::array shape); +//! Function object to generate periodic data. 
+struct PeriodicGenerator { + //! Constructor. + //! + //!\param value Starting value. + //!\param period Generator period. + PeriodicGenerator(const std::size_t period, const long int value); + + //! Generator period. + std::size_t period; + + //! Starting value. + long int value; + + //! Number of times `operator()` has been called. + std::size_t ncalls; + + //! Generate next value in periodic sequence. + long int operator()(); +}; + #include "testing_utilities.tpp" #endif diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp index 8ab071fb6f..24795a0c4e 100644 --- a/tests/src/test_compressors.cpp +++ b/tests/src/test_compressors.cpp @@ -9,6 +9,8 @@ #include "compressors.hpp" #include "format.hpp" +#include "testing_utilities.hpp" + namespace { template @@ -28,8 +30,73 @@ void test_huffman_identity(std::default_random_engine &gen, delete[] decompressed; } +void test_huffman_compression_regression(long int *const src, + const std::size_t srcLen) { + long int *const src_ = new long int[srcLen]; + std::copy(src, src + srcLen, src_); + + const mgard::MemoryBuffer out = + mgard::compress_memory_huffman(src, srcLen); + const mgard::MemoryBuffer out_ = + mgard::compress_memory_huffman(src_, srcLen); + + REQUIRE(out.size == out_.size); + unsigned char const *const p = out.data.get(); + unsigned char const *const p_ = out_.data.get(); + REQUIRE(std::equal(p, p + out.size, p_)); + + delete[] src_; +} + +void test_hcr_constant(const std::size_t srcLen, const long int q) { + long int *const src = new long int[srcLen]; + std::fill(src, src + srcLen, q); + test_huffman_compression_regression(src, srcLen); + delete[] src; +} + +void test_hcr_periodic(const std::size_t srcLen, const long int initial, + const std::size_t period) { + long int *const src = new long int[srcLen]; + std::generate(src, src + srcLen, PeriodicGenerator(period, initial)); + test_huffman_compression_regression(src, srcLen); + delete[] src; +} + +void test_hcr_random(const std::size_t 
srcLen, const long int a, + const long int b, std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + long int *const src = new long int[srcLen]; + std::generate(src, src + srcLen, [&] { return dis(gen); }); + test_huffman_compression_regression(src, srcLen); + delete[] src; +} + } // namespace +TEST_CASE("Huffman compression regression", "[compressors] [regression]") { + SECTION("constant data") { + test_hcr_constant(5, -3); + test_hcr_constant(25, 0); + test_hcr_constant(625, 81); + } + + SECTION("periodic data") { + test_hcr_periodic(5, 0, 5); + test_hcr_periodic(25, -4, 6); + test_hcr_periodic(625, 22, 20); + } + + SECTION("random data") { + std::default_random_engine gen(131051); + test_hcr_random(50, 0, 1, gen); + test_hcr_random(25, -8, 16, gen); + test_hcr_random(625, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_hcr_random(3125, -100, 100, gen); + } +} + TEST_CASE("Huffman compression", "[compressors] [!mayfail]") { std::default_random_engine gen(257100); const std::size_t n = 5000; diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index 2b31eb4c9e..d2c6cdfd3c 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -45,29 +45,6 @@ void test_encoding_regression_constant(const std::size_t N, const long int q) { delete[] quantized; } -//! Function object to generate periodict data. -struct PeriodicGenerator { - //! Constructor. - //! - //!\param value Starting value. - //!\param period Generator period. - PeriodicGenerator(const std::size_t period, const long int value) - : period(period), value(value), ncalls(0) {} - - //! Generator period. - std::size_t period; - - //! Starting value. - long int value; - - //! Number of times `operator()` has been called. 
- std::size_t ncalls; - - long int operator()() { - return value + static_cast(ncalls++ % period); - } -}; - void test_encoding_regression_periodic(const std::size_t N, const long int q, const std::size_t period) { long int *const quantized = new long int[N]; diff --git a/tests/src/testing_utilities.cpp b/tests/src/testing_utilities.cpp index 822c6c87e3..d84210d5fd 100644 --- a/tests/src/testing_utilities.cpp +++ b/tests/src/testing_utilities.cpp @@ -20,3 +20,11 @@ std::ostream &operator<<(std::ostream &os, const TrialTracker &tracker) { return os << tracker.nsuccesses << " successes and " << tracker.nfailures << " failures out of " << tracker.ntrials << " trials"; } + +PeriodicGenerator::PeriodicGenerator(const std::size_t period, + const long int value) + : period(period), value(value), ncalls(0) {} + +long int PeriodicGenerator::operator()() { + return value + static_cast(ncalls++ % period); +} From 58c13e3c9fc693a7244df73164cdbdb47dda2d15 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 2 Jun 2022 16:58:06 -0400 Subject: [PATCH 20/58] Reimplement Huffman compression with constituents. --- include/compressors.hpp | 8 +++ src/compressors.cpp | 104 +++++++++++++++++++++++++++++++-- tests/src/test_compressors.cpp | 2 +- 3 files changed, 109 insertions(+), 5 deletions(-) diff --git a/include/compressors.hpp b/include/compressors.hpp index 17cd7f7ce3..8e0952022a 100644 --- a/include/compressors.hpp +++ b/include/compressors.hpp @@ -23,6 +23,14 @@ namespace mgard { MemoryBuffer compress_memory_huffman(long int *const src, const std::size_t srcLen); +//! Compress an array using a Huffman tree. +//! +//!\param[in] src Array to be compressed. +//!\param[in] srcLen Size of array (number of elements) to be compressed. +MemoryBuffer +compress_memory_huffman_rewritten(long int *const src, + const std::size_t srcLen); + //! Decompress an array compressed with `compress_memory_huffman`. //! //!\param[in] src Compressed array. 
diff --git a/src/compressors.cpp b/src/compressors.cpp index bf5bd7c57f..84ebe9caf7 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -68,6 +69,27 @@ void decompress_memory_huffman(unsigned char *const src, free(huffman_encoding_p); } +namespace { + +using Constituent = std::pair; + +MemoryBuffer +gather_constituents(const std::vector &constituents) { + std::size_t nbuffer = 0; + for (const Constituent &constituent : constituents) { + nbuffer += constituent.second; + } + MemoryBuffer buffer(nbuffer); + unsigned char *p = buffer.data.get(); + for (const Constituent &constituent : constituents) { + std::memcpy(p, constituent.first, constituent.second); + p += constituent.second; + } + return buffer; +} + +} // namespace + MemoryBuffer compress_memory_huffman(long int *const src, const std::size_t srcLen) { #ifdef MGARD_TIMING @@ -89,13 +111,13 @@ MemoryBuffer compress_memory_huffman(long int *const src, "code written assuming `sizeof(unsigned int) == 4`"); const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); // Number of hit buffer padding bytes. - const std::size_t nhpb = offset ? offset / CHAR_BIT : sizeof(unsigned int); + const std::size_t nhbpb = offset ? 
offset / CHAR_BIT : sizeof(unsigned int); - assert(encoded.hit.size + nhpb == + assert(encoded.hit.size + nhbpb == encoded.nbits / CHAR_BIT + sizeof(unsigned int)); const size_t npayload = - encoded.hit.size + nhpb + encoded.missed.size + encoded.frequencies.size; + encoded.hit.size + nhbpb + encoded.missed.size + encoded.frequencies.size; unsigned char *const payload = new unsigned char[npayload]; unsigned char *bufp = payload; @@ -107,7 +129,7 @@ MemoryBuffer compress_memory_huffman(long int *const src, { const unsigned char zero{0}; - for (std::size_t i = 0; i < nhpb; ++i) { + for (std::size_t i = 0; i < nhbpb; ++i) { std::memcpy(bufp, &zero, 1); bufp += 1; } @@ -166,6 +188,80 @@ MemoryBuffer compress_memory_huffman(long int *const src, return MemoryBuffer(buffer, bufferLen); } +MemoryBuffer +compress_memory_huffman_rewritten(long int *const src, + const std::size_t srcLen) { +#ifdef MGARD_TIMING + auto huff_time1 = std::chrono::high_resolution_clock::now(); +#endif + HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); + + assert(not(encoded.hit.size % sizeof(unsigned int))); + +#ifdef MGARD_TIMING + auto huff_time2 = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + huff_time2 - huff_time1); + std::cout << "Huffman tree time = " << (double)duration.count() / 1000000 + << "\n"; +#endif + static_assert(CHAR_BIT == 8, "code written assuming `CHAR_BIT == 8`"); + static_assert(sizeof(unsigned int) == 4, + "code written assuming `sizeof(unsigned int) == 4`"); + const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); + // Number of hit buffer padding bytes. + const std::size_t nhbpb = offset ? 
offset / CHAR_BIT : sizeof(unsigned int); + + assert(encoded.hit.size + nhbpb == + encoded.nbits / CHAR_BIT + sizeof(unsigned int)); + + unsigned char const *hbpb = new unsigned char[nhbpb](); + MemoryBuffer payload = gather_constituents({ + {encoded.frequencies.data.get(), encoded.frequencies.size}, + {encoded.hit.data.get(), encoded.hit.size}, + {hbpb, nhbpb}, + {encoded.missed.data.get(), encoded.missed.size}, + }); + delete[] hbpb; + +#ifndef MGARD_ZSTD +#ifdef MGARD_TIMING + auto z_time1 = std::chrono::high_resolution_clock::now(); +#endif + const MemoryBuffer out_data = + compress_memory_z(payload.data.get(), payload.size); +#ifdef MGARD_TIMING + auto z_time2 = std::chrono::high_resolution_clock::now(); + auto z_duration = + std::chrono::duration_cast(z_time2 - z_time1); + std::cout << "ZLIB compression time = " + << (double)z_duration.count() / 1000000 << "\n"; +#endif +#else +#ifdef MGARD_TIMING + auto zstd_time1 = std::chrono::high_resolution_clock::now(); +#endif + const MemoryBuffer out_data = + compress_memory_zstd(payload.data.get(), payload.size); +#ifdef MGARD_TIMING + auto zstd_time2 = std::chrono::high_resolution_clock::now(); + auto zstd_duration = std::chrono::duration_cast( + zstd_time2 - zstd_time1); + std::cout << "ZSTD compression time = " + << (double)zstd_duration.count() / 1000000 << "\n"; +#endif +#endif + + return gather_constituents( + {{reinterpret_cast(&encoded.frequencies.size), + sizeof(encoded.frequencies.size)}, + {reinterpret_cast(&encoded.nbits), + sizeof(encoded.nbits)}, + {reinterpret_cast(&encoded.missed.size), + sizeof(encoded.missed.size)}, + {out_data.data.get(), out_data.size}}); +} + #ifdef MGARD_ZSTD /*! CHECK * Check that the condition holds. If it doesn't print a message and die. 
diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp index 24795a0c4e..74da33a86b 100644 --- a/tests/src/test_compressors.cpp +++ b/tests/src/test_compressors.cpp @@ -38,7 +38,7 @@ void test_huffman_compression_regression(long int *const src, const mgard::MemoryBuffer out = mgard::compress_memory_huffman(src, srcLen); const mgard::MemoryBuffer out_ = - mgard::compress_memory_huffman(src_, srcLen); + mgard::compress_memory_huffman_rewritten(src_, srcLen); REQUIRE(out.size == out_.size); unsigned char const *const p = out.data.get(); From 69d0e72c4817446f0c6e6bd1dc827571c2b10c54 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Fri, 3 Jun 2022 10:50:40 -0400 Subject: [PATCH 21/58] Remove timing statements. --- src/compressors.cpp | 65 --------------------------------------------- 1 file changed, 65 deletions(-) diff --git a/src/compressors.cpp b/src/compressors.cpp index 84ebe9caf7..7e8064b8f2 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -16,11 +16,6 @@ #include "huffman.hpp" #include "utilities.hpp" -#ifdef MGARD_TIMING -#include -#include -#endif - #ifdef MGARD_ZSTD #include #endif @@ -92,20 +87,10 @@ gather_constituents(const std::vector &constituents) { MemoryBuffer compress_memory_huffman(long int *const src, const std::size_t srcLen) { -#ifdef MGARD_TIMING - auto huff_time1 = std::chrono::high_resolution_clock::now(); -#endif HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); assert(not(encoded.hit.size % sizeof(unsigned int))); -#ifdef MGARD_TIMING - auto huff_time2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - huff_time2 - huff_time1); - std::cout << "Huffman tree time = " << (double)duration.count() / 1000000 - << "\n"; -#endif static_assert(CHAR_BIT == 8, "code written assuming `CHAR_BIT == 8`"); static_assert(sizeof(unsigned int) == 4, "code written assuming `sizeof(unsigned int) == 4`"); @@ -139,31 +124,11 @@ MemoryBuffer compress_memory_huffman(long int 
*const src, bufp += encoded.missed.size; #ifndef MGARD_ZSTD -#ifdef MGARD_TIMING - auto z_time1 = std::chrono::high_resolution_clock::now(); -#endif const MemoryBuffer out_data = compress_memory_z(payload, npayload); -#ifdef MGARD_TIMING - auto z_time2 = std::chrono::high_resolution_clock::now(); - auto z_duration = - std::chrono::duration_cast(z_time2 - z_time1); - std::cout << "ZLIB compression time = " - << (double)z_duration.count() / 1000000 << "\n"; -#endif #else -#ifdef MGARD_TIMING - auto zstd_time1 = std::chrono::high_resolution_clock::now(); -#endif const MemoryBuffer out_data = compress_memory_zstd(payload, npayload); -#ifdef MGARD_TIMING - auto zstd_time2 = std::chrono::high_resolution_clock::now(); - auto zstd_duration = std::chrono::duration_cast( - zstd_time2 - zstd_time1); - std::cout << "ZSTD compression time = " - << (double)zstd_duration.count() / 1000000 << "\n"; -#endif #endif delete[] payload; bufp = nullptr; @@ -191,20 +156,10 @@ MemoryBuffer compress_memory_huffman(long int *const src, MemoryBuffer compress_memory_huffman_rewritten(long int *const src, const std::size_t srcLen) { -#ifdef MGARD_TIMING - auto huff_time1 = std::chrono::high_resolution_clock::now(); -#endif HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); assert(not(encoded.hit.size % sizeof(unsigned int))); -#ifdef MGARD_TIMING - auto huff_time2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - huff_time2 - huff_time1); - std::cout << "Huffman tree time = " << (double)duration.count() / 1000000 - << "\n"; -#endif static_assert(CHAR_BIT == 8, "code written assuming `CHAR_BIT == 8`"); static_assert(sizeof(unsigned int) == 4, "code written assuming `sizeof(unsigned int) == 4`"); @@ -225,31 +180,11 @@ compress_memory_huffman_rewritten(long int *const src, delete[] hbpb; #ifndef MGARD_ZSTD -#ifdef MGARD_TIMING - auto z_time1 = std::chrono::high_resolution_clock::now(); -#endif const MemoryBuffer out_data = 
compress_memory_z(payload.data.get(), payload.size); -#ifdef MGARD_TIMING - auto z_time2 = std::chrono::high_resolution_clock::now(); - auto z_duration = - std::chrono::duration_cast(z_time2 - z_time1); - std::cout << "ZLIB compression time = " - << (double)z_duration.count() / 1000000 << "\n"; -#endif #else -#ifdef MGARD_TIMING - auto zstd_time1 = std::chrono::high_resolution_clock::now(); -#endif const MemoryBuffer out_data = compress_memory_zstd(payload.data.get(), payload.size); -#ifdef MGARD_TIMING - auto zstd_time2 = std::chrono::high_resolution_clock::now(); - auto zstd_duration = std::chrono::duration_cast( - zstd_time2 - zstd_time1); - std::cout << "ZSTD compression time = " - << (double)zstd_duration.count() / 1000000 << "\n"; -#endif #endif return gather_constituents( From f049be1af7042faa4df8abf67c3bae74fb906ec8 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Fri, 3 Jun 2022 12:03:30 -0400 Subject: [PATCH 22/58] Return struct from original Huffman decoder. --- include/huffman.hpp | 15 +------ src/compressors.cpp | 91 ++++++++++++++++++++++---------------- src/huffman.cpp | 36 ++++++++------- tests/src/test_huffman.cpp | 2 +- 4 files changed, 76 insertions(+), 68 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 58a47d02b5..3749cf288d 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -56,19 +56,8 @@ huffman_encoding_rewritten(long int const *const quantized_data, //! Decode a stream encoded using a Huffman code. //! -//!\param[out] quantized_data Output buffer (quantized coefficients). -//!\param[in] quantized_data_size Size *in bytes* of output buffer. -//!\param[in] out_data_hit Compressed buffer. -//!\param[in] out_data_hit_size Size *in bits* of compressed buffer. -//!\param[in] out_data_miss 'Missed' buffer (input symbols not assigned codes). -//!\param[in] out_data_miss_size Size *in bytes* of 'missed' buffer. -//!\param[in] out_tree Frequency table for input buffer. 
-//!\param[in] out_tree_size Size *in bytes* of the frequency table. -void huffman_decoding( - long int *const quantized_data, const std::size_t quantized_data_size, - unsigned char const *const out_data_hit, const size_t out_data_hit_size, - unsigned char const *const out_data_miss, const size_t out_data_miss_size, - unsigned char const *const out_tree, const size_t out_tree_size); +//!\param[in] encoded Input buffer (Huffman-encoded stream). +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded); //! Codeword (in progress) associated to a node in a Huffman code creation tree. struct HuffmanCodeword { diff --git a/src/compressors.cpp b/src/compressors.cpp index 7e8064b8f2..151712e298 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -22,46 +22,65 @@ namespace mgard { +namespace { + +std::size_t hit_buffer_size(const std::size_t nbits) { + return nbits / CHAR_BIT + sizeof(unsigned int); +} + +} // namespace + void decompress_memory_huffman(unsigned char *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen) { - unsigned char *out_data_hit = 0; - size_t out_data_hit_size; - unsigned char *out_data_miss = 0; - size_t out_data_miss_size; - unsigned char *out_tree = 0; - size_t out_tree_size; - - unsigned char *buf = src; - - out_tree_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_hit_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_miss_size = *(size_t *)buf; - buf += sizeof(size_t); - size_t total_huffman_size = out_tree_size + out_data_hit_size / CHAR_BIT + - sizeof(unsigned int) + out_data_miss_size; - unsigned char *huffman_encoding_p = - (unsigned char *)malloc(total_huffman_size); + std::size_t const *const sizes = reinterpret_cast(src); + const std::size_t nfrequencies = sizes[0]; + const std::size_t nbits = sizes[1]; + const std::size_t nmissed = sizes[2]; + const std::size_t nhit = hit_buffer_size(nbits); + + MemoryBuffer buffer(nfrequencies + nhit + nmissed); + { + const 
std::size_t offset = 3 * sizeof(std::size_t); + unsigned char const *const src_ = src + offset; + const std::size_t srcLen_ = srcLen - offset; + unsigned char *const dst_ = buffer.data.get(); + const std::size_t dstLen_ = buffer.size; + #ifndef MGARD_ZSTD - decompress_memory_z(buf, srcLen - 3 * sizeof(size_t), huffman_encoding_p, - total_huffman_size); + decompress_memory_z(src_, srcLen_, dst_, dstLen_); #else - decompress_memory_zstd(buf, srcLen - 3 * sizeof(size_t), huffman_encoding_p, - total_huffman_size); + decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); #endif - out_tree = huffman_encoding_p; - out_data_hit = huffman_encoding_p + out_tree_size; - out_data_miss = huffman_encoding_p + out_tree_size + - out_data_hit_size / CHAR_BIT + sizeof(unsigned int); + } - huffman_decoding(dst, dstLen, out_data_hit, out_data_hit_size, out_data_miss, - out_data_miss_size, out_tree, out_tree_size); + HuffmanEncodedStream encoded(nbits, nhit, nmissed, nfrequencies); + { + unsigned char const *begin; + unsigned char const *end; + + begin = buffer.data.get(); + end = begin + nfrequencies; + std::copy(begin, end, encoded.frequencies.data.get()); + + begin = end; + end = begin + nhit; + std::copy(begin, end, encoded.hit.data.get()); + + begin = end; + end = begin + nmissed; + std::copy(begin, end, encoded.missed.data.get()); + } - free(huffman_encoding_p); + const MemoryBuffer decoded = huffman_decoding(encoded); + { + long int const *const p = decoded.data.get(); + if (decoded.size * sizeof(*p) != dstLen) { + throw std::runtime_error( + "mismatch between expected and obtained decompressed buffer sizes"); + } + std::copy(p, p + decoded.size, dst); + } } namespace { @@ -98,8 +117,7 @@ MemoryBuffer compress_memory_huffman(long int *const src, // Number of hit buffer padding bytes. const std::size_t nhbpb = offset ? 
offset / CHAR_BIT : sizeof(unsigned int); - assert(encoded.hit.size + nhbpb == - encoded.nbits / CHAR_BIT + sizeof(unsigned int)); + assert(encoded.hit.size + nhbpb == hit_buffer_size(encoded.nbits)); const size_t npayload = encoded.hit.size + nhbpb + encoded.missed.size + encoded.frequencies.size; @@ -156,7 +174,7 @@ MemoryBuffer compress_memory_huffman(long int *const src, MemoryBuffer compress_memory_huffman_rewritten(long int *const src, const std::size_t srcLen) { - HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); + const HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); assert(not(encoded.hit.size % sizeof(unsigned int))); @@ -167,8 +185,7 @@ compress_memory_huffman_rewritten(long int *const src, // Number of hit buffer padding bytes. const std::size_t nhbpb = offset ? offset / CHAR_BIT : sizeof(unsigned int); - assert(encoded.hit.size + nhbpb == - encoded.nbits / CHAR_BIT + sizeof(unsigned int)); + assert(encoded.hit.size + nhbpb == hit_buffer_size(encoded.nbits)); unsigned char const *hbpb = new unsigned char[nhbpb](); MemoryBuffer payload = gather_constituents({ diff --git a/src/huffman.cpp b/src/huffman.cpp index 937c5d9b6a..b53e072237 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -422,23 +422,28 @@ huffman_encoding_rewritten(long int const *const quantized_data, return out; } -void huffman_decoding(long int *const quantized_data, - const std::size_t quantized_data_size, - unsigned char const *const out_data_hit, - const std::size_t out_data_hit_size, - unsigned char const *const out_data_miss, - const std::size_t out_data_miss_size, - unsigned char const *const out_tree, - const std::size_t out_tree_size) { +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { + const std::size_t out_data_miss_size = encoded.missed.size; + const std::size_t out_tree_size = encoded.frequencies.size; + unsigned char const *const out_data_hit = encoded.hit.data.get(); + unsigned char const *const out_data_miss = 
encoded.missed.data.get(); + unsigned char const *const out_tree = encoded.frequencies.data.get(); + std::size_t const *const cft = (std::size_t const *)out_tree; - const int nonZeros = out_tree_size / (2 * sizeof(std::size_t)); + const std::size_t nnz = out_tree_size / (2 * sizeof(std::size_t)); // The elements of the array are value-initialized (here, zero-initialized). std::size_t *const ft = new std::size_t[nql](); - for (int j = 0; j < nonZeros; j++) { - ft[cft[2 * j]] = cft[2 * j + 1]; + std::size_t nquantized = 0; + for (std::size_t j = 0; j < nnz; ++j) { + const std::size_t frequency = cft[2 * j + 1]; + nquantized += frequency; + ft[cft[2 * j]] = frequency; } + MemoryBuffer out(nquantized); + long int *const quantized_data = out.data.get(); + my_priority_queue *const phtree = build_tree(ft); delete[] ft; @@ -460,7 +465,7 @@ void huffman_decoding(long int *const quantized_data, long int *q = quantized_data; std::size_t i = 0; std::size_t num_missed = 0; - while (q < (quantized_data + (quantized_data_size / sizeof(*q)))) { + while (q < quantized_data + nquantized) { htree_node const *root = phtree->top(); assert(root); @@ -504,13 +509,10 @@ void huffman_decoding(long int *const quantized_data, assert(start_bit == out_data_hit_size); assert(sizeof(int) * num_missed == out_data_miss_size); - // Avoid unused argument warning. If NDEBUG is defined, then the assert - // becomes empty and out_data_hit_size is unused. Tell the compiler that - // is OK and expected. 
- (void)out_data_hit_size; - delete[] miss_buf; free_tree(phtree); + + return out; } } // namespace mgard diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index d2c6cdfd3c..b271581ceb 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -65,7 +65,7 @@ void test_encoding_regression_random(const std::size_t N, const long int a, } // namespace -TEST_CASE("encoding regression", "[huffman]") { +TEST_CASE("encoding regression", "[huffman] [regression]") { SECTION("constant data") { test_encoding_regression_constant(10, 0); test_encoding_regression_constant(100, 732); From 81cc00bcb22f637fa13b8996dce0aec4a12fbecf Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Fri, 3 Jun 2022 12:32:54 -0400 Subject: [PATCH 23/58] Add Huffman decoding regression tests. --- tests/src/test_huffman.cpp | 69 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index b271581ceb..9d1829b290 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -38,6 +38,27 @@ void test_encoding_regression(long int *const quantized, const std::size_t N) { delete[] quantized_new; } +void test_decoding_regression(long int *const quantized, const std::size_t N) { + long int *const quantized_new = new long int[N]; + std::copy(quantized, quantized + N, quantized_new); + + const mgard::HuffmanEncodedStream encoded = + mgard::huffman_encoding(quantized, N); + const mgard::HuffmanEncodedStream encoded_new = + mgard::huffman_encoding(quantized_new, N); + + delete[] quantized_new; + + const mgard::MemoryBuffer out = mgard::huffman_decoding(encoded); + const mgard::MemoryBuffer out_new = + mgard::huffman_decoding(encoded); + + REQUIRE(out.size == out_new.size); + long int const *const p = out.data.get(); + long int const *const p_new = out_new.data.get(); + REQUIRE(std::equal(p, p + out.size, p_new)); +} + void test_encoding_regression_constant(const std::size_t N, const 
long int q) { long int *const quantized = new long int[N]; std::fill(quantized, quantized + N, q); @@ -63,6 +84,31 @@ void test_encoding_regression_random(const std::size_t N, const long int a, delete[] quantized; } +void test_decoding_regression_constant(const std::size_t N, const long int q) { + long int *const quantized = new long int[N]; + std::fill(quantized, quantized + N, q); + test_decoding_regression(quantized, N); + delete[] quantized; +} + +void test_decoding_regression_periodic(const std::size_t N, const long int q, + const std::size_t period) { + long int *const quantized = new long int[N]; + std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); + test_decoding_regression(quantized, N); + delete[] quantized; +} + +void test_decoding_regression_random(const std::size_t N, const long int a, + const long int b, + std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + long int *const quantized = new long int[N]; + std::generate(quantized, quantized + N, [&] { return dis(gen); }); + test_decoding_regression(quantized, N); + delete[] quantized; +} + } // namespace TEST_CASE("encoding regression", "[huffman] [regression]") { @@ -87,3 +133,26 @@ TEST_CASE("encoding regression", "[huffman] [regression]") { test_encoding_regression_random(10000, -100, 100, gen); } } + +TEST_CASE("decoding regression", "[huffman] [regression]") { + SECTION("constant data") { + test_decoding_regression_constant(10, -11); + test_decoding_regression_constant(100, 79); + test_decoding_regression_constant(1000, -7296); + } + + SECTION("periodic data") { + test_decoding_regression_periodic(10, 12, 4); + test_decoding_regression_periodic(100, -71, 9); + test_decoding_regression_periodic(1000, 3280, 23); + } + + SECTION("random data") { + std::default_random_engine gen(363022); + test_decoding_regression_random(10, 0, 1, gen); + test_decoding_regression_random(100, -15, -5, gen); + test_decoding_regression_random(1000, std::numeric_limits::min(), 
+ std::numeric_limits::max(), gen); + test_decoding_regression_random(10000, -100, 100, gen); + } +} From 0da8b2c0e1f7ca3b89ffc106205fa1fbc6f78935 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Mon, 6 Jun 2022 11:19:01 -0400 Subject: [PATCH 24/58] Reimplement Huffman decoding with `Bits`. --- include/huffman.hpp | 6 ++ src/huffman.cpp | 114 ++++++++++++++++++++++++++++++++----- tests/src/test_huffman.cpp | 2 +- 3 files changed, 107 insertions(+), 15 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 3749cf288d..07792cdab6 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -59,6 +59,12 @@ huffman_encoding_rewritten(long int const *const quantized_data, //!\param[in] encoded Input buffer (Huffman-encoded stream). MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded); +//! Decode a stream encoded using a Huffman code. +//! +//!\param[in] encoded Input buffer (Huffman-encoded stream). +MemoryBuffer +huffman_decoding_rewritten(const HuffmanEncodedStream &encoded); + //! Codeword (in progress) associated to a node in a Huffman code creation tree. struct HuffmanCodeword { //! Bytes containing the bits of the codeword. 
diff --git a/src/huffman.cpp b/src/huffman.cpp index b53e072237..11725d219b 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -240,6 +240,28 @@ HuffmanCodec build_huffman_codec(long int *const quantized_data, return codec; } +namespace { + +void endianness_shuffle(unsigned char *const buffer, const std::size_t nbytes) { + if (nbytes % sizeof(unsigned int)) { + throw std::runtime_error( + "buffer size not a multiple of `sizeof(unsigned int)`"); + } + const unsigned int one{1}; + const bool little_endian = *reinterpret_cast(&one); + if (little_endian) { + for (std::size_t i = 0; i < nbytes; i += sizeof(unsigned int)) { + unsigned char *a = buffer + i; + unsigned char *b = a + sizeof(unsigned int) - 1; + for (std::size_t j = 0; j < sizeof(unsigned int) / 2; ++j) { + std::swap(*a++, *b--); + } + } + } +} + +} // namespace + HuffmanEncodedStream huffman_encoding(long int *const quantized_data, const std::size_t n) { const HuffmanCodec codec = build_huffman_codec(quantized_data, n); @@ -405,20 +427,7 @@ huffman_encoding_rewritten(long int const *const quantized_data, } } - { - const unsigned int one{1}; - const bool little_endian = *reinterpret_cast(&one); - if (little_endian) { - for (std::size_t i = 0; i < nbytes; i += sizeof(unsigned int)) { - unsigned char *a = buffer + i; - unsigned char *b = a + sizeof(unsigned int) - 1; - for (std::size_t j = 0; j < sizeof(unsigned int) / 2; ++j) { - std::swap(*a++, *b--); - } - } - } - } - + endianness_shuffle(buffer, nbytes); return out; } @@ -515,4 +524,81 @@ MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { return out; } +MemoryBuffer +huffman_decoding_rewritten(const HuffmanEncodedStream &encoded) { + std::size_t const *const cft = + reinterpret_cast(encoded.frequencies.data.get()); + const std::size_t nnz = encoded.frequencies.size / (2 * sizeof(std::size_t)); + // The elements of the array are value-initialized (here, zero-initialized). 
+ std::size_t *const ft = new std::size_t[nql](); + + std::size_t nquantized = 0; + for (std::size_t j = 0; j < nnz; ++j) { + const std::size_t frequency = cft[2 * j + 1]; + nquantized += frequency; + ft[cft[2 * j]] = frequency; + } + + MemoryBuffer out(nquantized); + long int *q = out.data.get(); + + my_priority_queue *const phtree = build_tree(ft); + delete[] ft; + + // The encoded.missed.data.get() may not be aligned. Therefore, the code + // here makes a new buffer. + assert(not(encoded.missed.size % sizeof(int))); + int *const missed = new int[encoded.missed.size / sizeof(int)]; + std::memcpy(missed, encoded.missed.data.get(), encoded.missed.size); + + int const *p_missed = missed; + + const std::size_t nbytes = encoded.hit.size; + unsigned char *const buffer = new unsigned char[nbytes]; + { + unsigned char const *const p = encoded.hit.data.get(); + std::copy(p, p + nbytes, buffer); + } + endianness_shuffle(buffer, nbytes); + const Bits bits(buffer, buffer + encoded.nbits / CHAR_BIT, + encoded.nbits % CHAR_BIT); + + std::size_t nbits = 0; + std::size_t nmissed = 0; + htree_node const *const root = phtree->top(); + assert(root); + Bits::iterator p_ = bits.begin(); + for (std::size_t i = 0; i < nquantized; ++i) { + htree_node const *node = root; + + std::size_t len = 0; + while (node->left) { + node = *p_++ ? 
node->right : node->left; + ++len; + } + + if (node->q) { + *q = node->q - nql / 2; + } else { + *q = *p_missed - nql / 2; + + ++p_missed; + ++nmissed; + } + + ++q; + nbits += len; + } + + assert(nbits == encoded.nbits); + assert(sizeof(int) * nmissed == encoded.missed.size); + + delete[] missed; + free_tree(phtree); + + delete[] buffer; + + return out; +} + } // namespace mgard diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index 9d1829b290..5fed4bedec 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -51,7 +51,7 @@ void test_decoding_regression(long int *const quantized, const std::size_t N) { const mgard::MemoryBuffer out = mgard::huffman_decoding(encoded); const mgard::MemoryBuffer out_new = - mgard::huffman_decoding(encoded); + mgard::huffman_decoding_rewritten(encoded); REQUIRE(out.size == out_new.size); long int const *const p = out.data.get(); From 55dc22dd4b3858f770a171c04e0c4fbb43318ae7 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Mon, 6 Jun 2022 19:52:48 -0400 Subject: [PATCH 25/58] Use `HuffmanCode` in decoding reimplementation. --- include/huffman.hpp | 76 +++++++++++++++++++++-- include/huffman.tpp | 147 +++++++++++++++++++++++++++----------------- src/huffman.cpp | 98 +++++++++++++++-------------- 3 files changed, 213 insertions(+), 108 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 07792cdab6..d98f5b27e7 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include "utilities.hpp" @@ -118,8 +120,17 @@ struct CodeCreationTreeNode { }; //! Huffman code generated from/for an input stream. +//! +//!\note The construction of this class is a little convoluted. template class HuffmanCode { public: + static_assert(std::is_integral::value and + std::is_signed::value, + "symbol type must be signed and integral"); + + //! Shared pointer to node in Huffman code creation tree. + using Node = std::shared_ptr; + //! 
Constructor. //! //!\param ncodewords Number of symbols that will be assigned codewords. @@ -128,6 +139,13 @@ template class HuffmanCode { HuffmanCode(const std::size_t ncodewords, Symbol const *const begin, Symbol const *const end); + //! Constructor. + //! + //!\param ncodewords Number of symbols that will be assigned codewords. + //!\param pairs Index–frequency pairs for frequency table. + HuffmanCode(const std::size_t ncodewords, + const std::vector> &pairs); + //! Number of symbols that will be assigned codewords. std::size_t ncodewords; @@ -137,7 +155,8 @@ template class HuffmanCode { //! Codewords associated to the symbols. std::vector codewords; - //! Report the number of out-of-range symbols encountered in the stream. + //! Report the number of out-of-range symbols encountered in the stream or + //! given in the frequency table pairs. std::size_t nmissed() const; //! Check whether a symbol is eligible for a codeword. @@ -147,11 +166,58 @@ template class HuffmanCode { std::size_t index(const Symbol symbol) const; private: - //! Smallest symbol (inclusive) to receive a codeword. - Symbol min_symbol; + //! Function object used to compare code creation tree nodes. + struct HeldCountGreater { + bool operator()(const Node &a, const Node &b) const; + }; + +public: + //! Huffman code creation tree. + std::priority_queue, HeldCountGreater> queue; + + //! Decode a codeword (identified by associated leaf) to a symbol. + //! + //!\pre `leaf` must be a leaf (rather than an interior node) of the code + //! creation tree. + //! + //!\param leaf Leaf (associated to a codeword) to decode. + //!\param missed Pointer to next out-of-range symbol. If `leaf` is associated + //! to the out-of-range codeword, this pointer will be dereferenced and + //! incremented. + Symbol decode(const Node &leaf, Symbol const *&missed) const; - //! Largest symbol (inclusive) to receive a codeword. - Symbol max_symbol; +private: + //! 
Smallest and largest symbols (inclusive) to receive codewords. + std::pair endpoints; + + //! Set the range of symbols that will be assigned codewords. + //! + //!\note This function depends on `ncodewords`. + void set_endpoints(); + + //! Populate the frequency table using a stream of symbols. + //! + //!\pre `frequencies` should have length `ncodewords` and all entries should + //! be zero. + //! + //!\param begin Beginning of stream of symbols. + //!\param end End of stream of symbols. + void populate_frequencies(Symbol const *const begin, Symbol const *const end); + + //! Populate the frequency table from a collection of index–frequency pairs. + //! + //!\pre `frequencies` should have length `ncodewords` and all entries should + //! be zero. + //! + //!\param pairs Beginning of stream of symbols. + //!\param end End of stream of symbols. + void populate_frequencies( + const std::vector> &pairs); + + //! Create the Huffman code creation tree. + //! + //!\note This function depends on `frequencies`. + void create_code_creation_tree(); // TODO: Check that frequency count ties aren't going to hurt us here. Stable // sorting algorithm in `priority_queue`? diff --git a/include/huffman.tpp b/include/huffman.tpp index da11ac5d97..8d6a7cc9f5 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -1,76 +1,64 @@ #include "utilities.hpp" #include +#include #include -#include #include -#include namespace mgard { -//! This is used in the instantization of `std::priority_queue`. 
-template struct HeldCountGreater { - bool operator()(const T &a, const T &b) const { return a->count > b->count; } -}; - template -HuffmanCode::HuffmanCode(const std::size_t ncodewords, - Symbol const *const begin, - Symbol const *const end) - : ncodewords(ncodewords), frequencies(ncodewords), codewords(ncodewords) { - static_assert(std::is_integral::value and - std::is_signed::value, - "symbol type must be signed and integral"); +bool HuffmanCode::HeldCountGreater:: +operator()(const typename HuffmanCode::Node &a, + const typename HuffmanCode::Node &b) const { + return a->count > b->count; +} + +template void HuffmanCode::set_endpoints() { // Haven't carefully checked what the minimum acceptable value is. if (not ncodewords) { throw std::invalid_argument("`ncodewords` must be positive."); } - { - const Symbol SYMBOL_MAX = std::numeric_limits::max(); - const Symbol SYMBOL_MIN = std::numeric_limits::min(); - - const std::size_t max_symbol_ = (ncodewords + 1) / 2 - 1; - const std::size_t opp_min_symbol_ = ncodewords / 2; - - // TODO: There is surely a better way of doing this. Lots of potential - // issues with directly comparing `opp_min_symbol_` and `-SYMBOL_MIN`. - // `-SYMBOL_MIN` can't necessarily be represented as a `Symbol`, for - // example. Trying to avoid overflows. - std::size_t a = opp_min_symbol_; - Symbol b = SYMBOL_MIN; - while (a) { - a /= 2; - b /= 2; - } - if (not b) { - // Only a "risk" because we haven't actually established that - // `opp_min_symbol_` is greater in magnitude than `SYMBOL_MIN`. - throw std::overflow_error( - "risk that minimum symbol cannot be represented in symbol type"); - } else if (opp_min_symbol_ > SYMBOL_MAX) { - throw std::overflow_error( - "opposite of minimum symbol canont be represented in symbol type"); - } else { - min_symbol = -static_cast(opp_min_symbol_); - } - - // `opp_min_symbol_` is either equal to or one greater than `max_symbol_`, - // and we checked above that `opp_min_symbol <= SYMBOL_MAX`. 
So, we know - // that `max_symbol_ <= SYMBOL_MAX` here. - max_symbol = max_symbol_; + const Symbol SYMBOL_MAX = std::numeric_limits::max(); + const Symbol SYMBOL_MIN = std::numeric_limits::min(); + + const std::size_t max_symbol_ = (ncodewords + 1) / 2 - 1; + const std::size_t opp_min_symbol_ = ncodewords / 2; + + // There is surely a better way of doing this. Lots of potential issues with + // directly comparing `opp_min_symbol_` and `-SYMBOL_MIN`. `-SYMBOL_MIN` + // can't necessarily be represented as a `Symbol`, for example. Trying to + // avoid overflows. + std::size_t a = opp_min_symbol_; + Symbol b = SYMBOL_MIN; + while (a) { + a /= 2; + b /= 2; } - for (const Symbol symbol : - RangeSlice{.begin_ = begin, .end_ = end}) { - ++frequencies.at(index(symbol)); + if (not b) { + // Only a "risk" because we haven't actually established that + // `opp_min_symbol_` is greater in magnitude than `SYMBOL_MIN`. + throw std::overflow_error( + "risk that minimum symbol cannot be represented in symbol type"); + } else if (opp_min_symbol_ > SYMBOL_MAX) { + throw std::overflow_error( + "opposite of minimum symbol canont be represented in symbol type"); + } else { + endpoints.first = -static_cast(opp_min_symbol_); } - using T = std::shared_ptr; - std::priority_queue, HeldCountGreater> queue; + // `opp_min_symbol_` is either equal to or one greater than `max_symbol_`, + // and we checked above that `opp_min_symbol <= SYMBOL_MAX`. So, we know + // that `max_symbol_ <= SYMBOL_MAX` here. + endpoints.second = max_symbol_; +} +template +void HuffmanCode::create_code_creation_tree() { // We can't quite use a `ZippedRange` here, I think, because - // `ZippedRange::iterator` doesn't expose the underlying iterators and - // we want a pointer to the codeword. + // `ZippedRange::iterator` doesn't expose the underlying iterators and we want + // a pointer to the codeword. 
typename std::vector::const_iterator p = frequencies.cbegin(); HuffmanCodeword *q = codewords.data(); for (std::size_t i = 0; i < ncodewords; ++i) { @@ -89,7 +77,54 @@ HuffmanCode::HuffmanCode(const std::size_t ncodewords, queue.push(std::make_shared(a, b)); } +} +template +void HuffmanCode::populate_frequencies(Symbol const *const begin, + Symbol const *const end) { + for (const Symbol symbol : + RangeSlice{.begin_ = begin, .end_ = end}) { + ++frequencies.at(index(symbol)); + } +} + +template +Symbol +HuffmanCode::decode(const typename HuffmanCode::Node &leaf, + Symbol const *&missed) const { + const std::ptrdiff_t offset = leaf->codeword - codewords.data(); + // If `offset == 0`, this is the leaf corresponding to out-of-range symbols. + assert(offset >= 0); + return offset ? endpoints.first + (offset - 1) : *missed++; +} + +template +void HuffmanCode::populate_frequencies( + const std::vector> &pairs) { + for (auto [index, frequency] : pairs) { + frequencies.at(index) = frequency; + } +} + +template +HuffmanCode::HuffmanCode(const std::size_t ncodewords, + Symbol const *const begin, + Symbol const *const end) + : ncodewords(ncodewords), frequencies(ncodewords), codewords(ncodewords) { + set_endpoints(); + populate_frequencies(begin, end); + create_code_creation_tree(); + recursively_set_codewords(queue.top(), {}); +} + +template +HuffmanCode::HuffmanCode( + const std::size_t ncodewords, + const std::vector> &pairs) + : ncodewords(ncodewords), frequencies(ncodewords), codewords(ncodewords) { + set_endpoints(); + populate_frequencies(pairs); + create_code_creation_tree(); recursively_set_codewords(queue.top(), {}); } @@ -99,12 +134,12 @@ template std::size_t HuffmanCode::nmissed() const { template bool HuffmanCode::out_of_range(const Symbol symbol) const { - return symbol < min_symbol or symbol > max_symbol; + return symbol < endpoints.first or symbol > endpoints.second; } template std::size_t HuffmanCode::index(const Symbol symbol) const { - return 
out_of_range(symbol) ? 0 : 1 + symbol - min_symbol; + return out_of_range(symbol) ? 0 : 1 + symbol - endpoints.first; } template diff --git a/src/huffman.cpp b/src/huffman.cpp index 11725d219b..e0b250c48f 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -5,12 +5,13 @@ #include #include - #include #include #include #include +#include + #include "huffman.hpp" namespace mgard { @@ -524,34 +525,55 @@ MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { return out; } +namespace { + +long int decode(const HuffmanCode &code, + const typename HuffmanCode::Node &leaf, + long int const *&missed) { + long int const *const start = missed; + long int decoded = code.decode(leaf, missed); + if (missed != start) { + decoded -= nql / 2; + } + return decoded; +} + +} // namespace + MemoryBuffer huffman_decoding_rewritten(const HuffmanEncodedStream &encoded) { - std::size_t const *const cft = - reinterpret_cast(encoded.frequencies.data.get()); - const std::size_t nnz = encoded.frequencies.size / (2 * sizeof(std::size_t)); - // The elements of the array are value-initialized (here, zero-initialized). 
- std::size_t *const ft = new std::size_t[nql](); + using Symbol = long int; + using MissedSymbol = int; + const std::size_t nnz = encoded.frequencies.size / (2 * sizeof(std::size_t)); + std::vector> pairs(nnz); std::size_t nquantized = 0; - for (std::size_t j = 0; j < nnz; ++j) { - const std::size_t frequency = cft[2 * j + 1]; - nquantized += frequency; - ft[cft[2 * j]] = frequency; + { + std::size_t const *p = + reinterpret_cast(encoded.frequencies.data.get()); + for (std::pair &pair : pairs) { + const std::size_t index = *p++; + const std::size_t frequency = *p++; + pair = {index, frequency}; + nquantized += frequency; + } } - MemoryBuffer out(nquantized); - long int *q = out.data.get(); - - my_priority_queue *const phtree = build_tree(ft); - delete[] ft; + const std::size_t ncodewords = nql - 1; + HuffmanCode code(ncodewords, pairs); - // The encoded.missed.data.get() may not be aligned. Therefore, the code - // here makes a new buffer. - assert(not(encoded.missed.size % sizeof(int))); - int *const missed = new int[encoded.missed.size / sizeof(int)]; - std::memcpy(missed, encoded.missed.data.get(), encoded.missed.size); + MemoryBuffer out(nquantized); + Symbol *q = out.data.get(); - int const *p_missed = missed; + assert(not(encoded.missed.size % sizeof(MissedSymbol))); + const std::size_t nmissed = encoded.missed.size / sizeof(MissedSymbol); + Symbol *const missed = new Symbol[nmissed]; + { + MissedSymbol const *const p = + reinterpret_cast(encoded.missed.data.get()); + std::copy(p, p + nmissed, missed); + } + Symbol const *p_missed = missed; const std::size_t nbytes = encoded.hit.size; unsigned char *const buffer = new unsigned char[nbytes]; @@ -564,38 +586,20 @@ huffman_decoding_rewritten(const HuffmanEncodedStream &encoded) { encoded.nbits % CHAR_BIT); std::size_t nbits = 0; - std::size_t nmissed = 0; - htree_node const *const root = phtree->top(); + const HuffmanCode::Node root = code.queue.top(); assert(root); - Bits::iterator p_ = bits.begin(); + 
Bits::iterator b = bits.begin(); for (std::size_t i = 0; i < nquantized; ++i) { - htree_node const *node = root; - - std::size_t len = 0; - while (node->left) { - node = *p_++ ? node->right : node->left; - ++len; - } - - if (node->q) { - *q = node->q - nql / 2; - } else { - *q = *p_missed - nql / 2; - - ++p_missed; - ++nmissed; - } - - ++q; - nbits += len; + HuffmanCode::Node node; + for (node = root; node->left; + node = *b++ ? node->right : node->left, ++nbits) + ; + *q++ = decode(code, node, p_missed); } - assert(nbits == encoded.nbits); - assert(sizeof(int) * nmissed == encoded.missed.size); + assert(sizeof(MissedSymbol) * (p_missed - missed) == encoded.missed.size); delete[] missed; - free_tree(phtree); - delete[] buffer; return out; From e50685dee9980e082a20970ec6480acdf9f7555a Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 7 Jun 2022 11:00:59 -0400 Subject: [PATCH 26/58] Add `sizeof` checks to Huffman reimplementations. --- src/huffman.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/huffman.cpp b/src/huffman.cpp index e0b250c48f..8acf634c26 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -346,9 +346,28 @@ HuffmanEncodedStream huffman_encoding(long int *const quantized_data, return out; } +namespace { + +void check_type_sizes() { + static_assert(CHAR_BIT == 8, + "code written with assumption that `CHAR_BIT == 8`"); + static_assert( + sizeof(unsigned int) == 4, + "code written with assumption that `sizeof(unsigned int) == 4`"); + static_assert(sizeof(int) == 4, + "code written with assumption that `sizeof(int) == 4`"); + static_assert( + sizeof(std::size_t) == 8, + "code written with assumption that `sizeof(unsigned int) == 8`"); +} + +} // namespace + HuffmanEncodedStream huffman_encoding_rewritten(long int const *const quantized_data, const std::size_t n) { + check_type_sizes(); + const std::size_t ncodewords = nql - 1; const HuffmanCode code(ncodewords, quantized_data, quantized_data + n); @@ -542,6 +561,8 @@ 
long int decode(const HuffmanCode &code, MemoryBuffer huffman_decoding_rewritten(const HuffmanEncodedStream &encoded) { + check_type_sizes(); + using Symbol = long int; using MissedSymbol = int; From 8be8f8798e8ea5d668d1c71a60308ecfa59244bf Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 7 Jun 2022 14:28:09 -0400 Subject: [PATCH 27/58] Remove `compress_memory_huffman` from library. --- include/compressors.hpp | 7 +- src/compressors.cpp | 73 +------------------- tests/CMakeLists.txt | 1 + tests/include/compressors_regression.hpp | 21 ++++++ tests/src/compressors_regression.cpp | 85 ++++++++++++++++++++++++ tests/src/test_compressors.cpp | 1 + 6 files changed, 112 insertions(+), 76 deletions(-) create mode 100644 tests/include/compressors_regression.hpp create mode 100644 tests/src/compressors_regression.cpp diff --git a/include/compressors.hpp b/include/compressors.hpp index 8e0952022a..b27865a8be 100644 --- a/include/compressors.hpp +++ b/include/compressors.hpp @@ -18,12 +18,7 @@ namespace mgard { //! Compress an array using a Huffman tree. //! -//!\param[in] src Array to be compressed. -//!\param[in] srcLen Size of array (number of elements) to be compressed. -MemoryBuffer compress_memory_huffman(long int *const src, - const std::size_t srcLen); - -//! Compress an array using a Huffman tree. +//!\deprecated //! //!\param[in] src Array to be compressed. //!\param[in] srcLen Size of array (number of elements) to be compressed. 
diff --git a/src/compressors.cpp b/src/compressors.cpp index 151712e298..2096b749e1 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -104,77 +104,10 @@ gather_constituents(const std::vector &constituents) { } // namespace -MemoryBuffer compress_memory_huffman(long int *const src, - const std::size_t srcLen) { - HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); - - assert(not(encoded.hit.size % sizeof(unsigned int))); - - static_assert(CHAR_BIT == 8, "code written assuming `CHAR_BIT == 8`"); - static_assert(sizeof(unsigned int) == 4, - "code written assuming `sizeof(unsigned int) == 4`"); - const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); - // Number of hit buffer padding bytes. - const std::size_t nhbpb = offset ? offset / CHAR_BIT : sizeof(unsigned int); - - assert(encoded.hit.size + nhbpb == hit_buffer_size(encoded.nbits)); - - const size_t npayload = - encoded.hit.size + nhbpb + encoded.missed.size + encoded.frequencies.size; - unsigned char *const payload = new unsigned char[npayload]; - unsigned char *bufp = payload; - - std::memcpy(bufp, encoded.frequencies.data.get(), encoded.frequencies.size); - bufp += encoded.frequencies.size; - - std::memcpy(bufp, encoded.hit.data.get(), encoded.hit.size); - bufp += encoded.hit.size; - - { - const unsigned char zero{0}; - for (std::size_t i = 0; i < nhbpb; ++i) { - std::memcpy(bufp, &zero, 1); - bufp += 1; - } - } - - std::memcpy(bufp, encoded.missed.data.get(), encoded.missed.size); - bufp += encoded.missed.size; - -#ifndef MGARD_ZSTD - const MemoryBuffer out_data = - compress_memory_z(payload, npayload); -#else - const MemoryBuffer out_data = - compress_memory_zstd(payload, npayload); -#endif - delete[] payload; - bufp = nullptr; - - const std::size_t bufferLen = 3 * sizeof(size_t) + out_data.size; - unsigned char *const buffer = new unsigned char[bufferLen]; - - bufp = buffer; - *(size_t *)bufp = encoded.frequencies.size; - bufp += sizeof(size_t); - - *(size_t 
*)bufp = encoded.nbits; - bufp += sizeof(size_t); - - *(size_t *)bufp = encoded.missed.size; - bufp += sizeof(size_t); - - { - unsigned char const *const p = out_data.data.get(); - std::copy(p, p + out_data.size, bufp); - } - return MemoryBuffer(buffer, bufferLen); -} - MemoryBuffer compress_memory_huffman_rewritten(long int *const src, const std::size_t srcLen) { - const HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); + const HuffmanEncodedStream encoded = huffman_encoding_rewritten(src, srcLen); assert(not(encoded.hit.size % sizeof(unsigned int))); @@ -358,8 +291,8 @@ MemoryBuffer compress(const pb::Header &header, void *const src, if (srcLen % qts) { throw std::runtime_error("incorrect quantization buffer size"); } - return compress_memory_huffman(reinterpret_cast(src), - srcLen / qts); + return compress_memory_huffman_rewritten(reinterpret_cast(src), + srcLen / qts); } #else throw std::runtime_error("MGARD compiled without ZSTD support"); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 427b2e4546..cfa61b4382 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -18,6 +18,7 @@ set( "src/test_decompose.cpp" "src/test_format.cpp" "src/test_quantize.cpp" + "src/compressors_regression.cpp" "src/test_compressors.cpp" "src/test_CompressedDataset.cpp" "src/test_huffman.cpp" diff --git a/tests/include/compressors_regression.hpp b/tests/include/compressors_regression.hpp new file mode 100644 index 0000000000..fc2b58577d --- /dev/null +++ b/tests/include/compressors_regression.hpp @@ -0,0 +1,21 @@ +#ifndef TESTING_COMPRESSORS_REGRESSION_HPP +#define TESTING_COMPRESSORS_REGRESSION_HPP +//!\file +//!\brief Huffman compression and decompression functions for regression tests. + +#include + +#include "utilities.hpp" + +namespace mgard { + +//! Compress an array using a Huffman tree. +//! +//!\param[in] src Array to be compressed. +//!\param[in] srcLen Size of array (number of elements) to be compressed. 
+MemoryBuffer compress_memory_huffman(long int *const src, + const std::size_t srcLen); + +} // namespace mgard + +#endif diff --git a/tests/src/compressors_regression.cpp b/tests/src/compressors_regression.cpp new file mode 100644 index 0000000000..740a504528 --- /dev/null +++ b/tests/src/compressors_regression.cpp @@ -0,0 +1,85 @@ +#include "compressors_regression.hpp" + +#include +#include + +#include "compressors.hpp" +#include "huffman.hpp" + +namespace mgard { + +static_assert(CHAR_BIT == 8, "code written assuming `CHAR_BIT == 8`"); + +static_assert(sizeof(unsigned int) == 4, + "code written assuming `sizeof(unsigned int) == 4`"); + +static_assert(sizeof(std::size_t) == 8, + "code written assuming `sizeof(std::size_t) == 8`"); + +// This code also makes endianness assumptions. + +MemoryBuffer compress_memory_huffman(long int *const src, + const std::size_t srcLen) { + HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); + + assert(not(encoded.hit.size % sizeof(unsigned int))); + + const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); + // Number of hit buffer padding bytes. + const std::size_t nhbpb = offset ? 
offset / CHAR_BIT : sizeof(unsigned int); + + assert(encoded.hit.size + nhbpb == hit_buffer_size(encoded.nbits)); + + const std::size_t npayload = + encoded.hit.size + nhbpb + encoded.missed.size + encoded.frequencies.size; + unsigned char *const payload = new unsigned char[npayload]; + unsigned char *bufp = payload; + + std::memcpy(bufp, encoded.frequencies.data.get(), encoded.frequencies.size); + bufp += encoded.frequencies.size; + + std::memcpy(bufp, encoded.hit.data.get(), encoded.hit.size); + bufp += encoded.hit.size; + + { + const unsigned char zero{0}; + for (std::size_t i = 0; i < nhbpb; ++i) { + std::memcpy(bufp, &zero, 1); + bufp += 1; + } + } + + std::memcpy(bufp, encoded.missed.data.get(), encoded.missed.size); + bufp += encoded.missed.size; + +#ifndef MGARD_ZSTD + const MemoryBuffer out_data = + compress_memory_z(payload, npayload); +#else + const MemoryBuffer out_data = + compress_memory_zstd(payload, npayload); +#endif + delete[] payload; + bufp = nullptr; + + const std::size_t bufferLen = 3 * sizeof(std::size_t) + out_data.size; + unsigned char *const buffer = new unsigned char[bufferLen]; + + bufp = buffer; + *(std::size_t *)bufp = encoded.frequencies.size; + bufp += sizeof(std::size_t); + + *(std::size_t *)bufp = encoded.nbits; + bufp += sizeof(std::size_t); + + *(std::size_t *)bufp = encoded.missed.size; + bufp += sizeof(std::size_t); + + { + unsigned char const *const p = out_data.data.get(); + std::copy(p, p + out_data.size, bufp); + } + return MemoryBuffer(buffer, bufferLen); +} + +} // namespace mgard diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp index 74da33a86b..11f7ff3acb 100644 --- a/tests/src/test_compressors.cpp +++ b/tests/src/test_compressors.cpp @@ -7,6 +7,7 @@ #include #include "compressors.hpp" +#include "compressors_regression.hpp" #include "format.hpp" #include "testing_utilities.hpp" From e5eb83ffc40dc0a1b4f96e5492ec35197d0acd0d Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 7 Jun 2022 
15:39:50 -0400 Subject: [PATCH 28/58] Add Huffman decompression regression tests. --- tests/src/test_compressors.cpp | 75 ++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp index 11f7ff3acb..391bc46de3 100644 --- a/tests/src/test_compressors.cpp +++ b/tests/src/test_compressors.cpp @@ -49,6 +49,34 @@ void test_huffman_compression_regression(long int *const src, delete[] src_; } +void test_huffman_decompression_regression(long int *const src, + const std::size_t srcLen) { + long int *const src_ = new long int[srcLen]; + std::copy(src, src + srcLen, src_); + + const mgard::MemoryBuffer compressed = + mgard::compress_memory_huffman(src, srcLen); + const mgard::MemoryBuffer compressed_ = + mgard::compress_memory_huffman(src_, srcLen); + + delete[] src_; + + mgard::MemoryBuffer out(srcLen); + mgard::MemoryBuffer out_(srcLen); + + unsigned char *const q = compressed.data.get(); + unsigned char *const q_ = compressed_.data.get(); + long int *const p = out.data.get(); + long int *const p_ = out_.data.get(); + + mgard::decompress_memory_huffman(q, compressed.size, p, + out.size * sizeof(long int)); + mgard::decompress_memory_huffman(q_, compressed_.size, p_, + out_.size * sizeof(long int)); + + REQUIRE(std::equal(p, p + srcLen, p_)); +} + void test_hcr_constant(const std::size_t srcLen, const long int q) { long int *const src = new long int[srcLen]; std::fill(src, src + srcLen, q); @@ -73,6 +101,30 @@ void test_hcr_random(const std::size_t srcLen, const long int a, delete[] src; } +void test_hdr_constant(const std::size_t srcLen, const long int q) { + long int *const src = new long int[srcLen]; + std::fill(src, src + srcLen, q); + test_huffman_decompression_regression(src, srcLen); + delete[] src; +} + +void test_hdr_periodic(const std::size_t srcLen, const long int initial, + const std::size_t period) { + long int *const src = new long int[srcLen]; + std::generate(src, src + 
srcLen, PeriodicGenerator(period, initial)); + test_huffman_decompression_regression(src, srcLen); + delete[] src; +} + +void test_hdr_random(const std::size_t srcLen, const long int a, + const long int b, std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + long int *const src = new long int[srcLen]; + std::generate(src, src + srcLen, [&] { return dis(gen); }); + test_huffman_decompression_regression(src, srcLen); + delete[] src; +} + } // namespace TEST_CASE("Huffman compression regression", "[compressors] [regression]") { @@ -98,6 +150,29 @@ TEST_CASE("Huffman compression regression", "[compressors] [regression]") { } } +TEST_CASE("Huffman decompression regression", "[compressors] [regression]") { + SECTION("constant data") { + test_hdr_constant(4, -143485); + test_hdr_constant(64, 0); + test_hdr_constant(256, 67486); + } + + SECTION("periodic data") { + test_hdr_periodic(10, 0, 3); + test_hdr_periodic(100, -570, 10); + test_hdr_periodic(1000, 394, 19); + } + + SECTION("random data") { + std::default_random_engine gen(566222); + test_hdr_random(100, 1, 2, gen); + test_hdr_random(30, -7, 7, gen); + test_hdr_random(900, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_hdr_random(2700, -60, 40, gen); + } +} + TEST_CASE("Huffman compression", "[compressors] [!mayfail]") { std::default_random_engine gen(257100); const std::size_t n = 5000; From a59ebebe7d7d039916dbb40f616ba60b273f4d3b Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 7 Jun 2022 15:53:01 -0400 Subject: [PATCH 29/58] Add Huffman decompression reimplementation. 
--- include/compressors.hpp | 9 ++-- src/compressors.cpp | 26 +++++++--- src/huffman.cpp | 2 +- tests/include/compressors_regression.hpp | 10 ++++ tests/src/compressors_regression.cpp | 61 ++++++++++++++++++++++++ tests/src/test_compressors.cpp | 8 ++-- tests/src/test_huffman.cpp | 43 +++++++++-------- 7 files changed, 123 insertions(+), 36 deletions(-) diff --git a/include/compressors.hpp b/include/compressors.hpp index b27865a8be..09f8c53c22 100644 --- a/include/compressors.hpp +++ b/include/compressors.hpp @@ -28,13 +28,16 @@ compress_memory_huffman_rewritten(long int *const src, //! Decompress an array compressed with `compress_memory_huffman`. //! +//!\deprecated +//! //!\param[in] src Compressed array. //!\param[in] srcLen Size in bytes of the compressed array. //!\param[out] dst Decompressed array. //!\param[in] dstLen Size in bytes of the decompressed array. -void decompress_memory_huffman(unsigned char *const src, - const std::size_t srcLen, long int *const dst, - const std::size_t dstLen); +void decompress_memory_huffman_rewritten(unsigned char *const src, + const std::size_t srcLen, + long int *const dst, + const std::size_t dstLen); #ifdef MGARD_ZSTD //! Compress an array using `zstd`. 
diff --git a/src/compressors.cpp b/src/compressors.cpp index 2096b749e1..f97528d3d6 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -30,9 +30,10 @@ std::size_t hit_buffer_size(const std::size_t nbits) { } // namespace -void decompress_memory_huffman(unsigned char *const src, - const std::size_t srcLen, long int *const dst, - const std::size_t dstLen) { +void decompress_memory_huffman_rewritten(unsigned char *const src, + const std::size_t srcLen, + long int *const dst, + const std::size_t dstLen) { std::size_t const *const sizes = reinterpret_cast(src); const std::size_t nfrequencies = sizes[0]; const std::size_t nbits = sizes[1]; @@ -54,7 +55,12 @@ void decompress_memory_huffman(unsigned char *const src, #endif } - HuffmanEncodedStream encoded(nbits, nhit, nmissed, nfrequencies); + // `huffman_decoding_rewritten` expects the size of the hit buffer to be a + // multiple of `sizeof(unsigned int)`. We'll zero out any extra bytes below. + const std::size_t nbytes = + sizeof(unsigned int) * + ((nhit + sizeof(unsigned int) - 1) / sizeof(unsigned int)); + HuffmanEncodedStream encoded(nbits, nbytes, nmissed, nfrequencies); { unsigned char const *begin; unsigned char const *end; @@ -67,12 +73,17 @@ void decompress_memory_huffman(unsigned char *const src, end = begin + nhit; std::copy(begin, end, encoded.hit.data.get()); + { + unsigned char *const p = encoded.hit.data.get(); + std::fill(p + nhit, p + nbytes, 0); + } + begin = end; end = begin + nmissed; std::copy(begin, end, encoded.missed.data.get()); } - const MemoryBuffer decoded = huffman_decoding(encoded); + const MemoryBuffer decoded = huffman_decoding_rewritten(encoded); { long int const *const p = decoded.data.get(); if (decoded.size * sizeof(*p) != dstLen) { @@ -325,8 +336,9 @@ void decompress(const pb::Header &header, void *const src, break; case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD - decompress_memory_huffman(static_cast(src), srcLen, - static_cast(dst), dstLen); + 
decompress_memory_huffman_rewritten(static_cast(src), + srcLen, static_cast(dst), + dstLen); break; #else throw std::runtime_error("MGARD compiled without ZSTD support"); diff --git a/src/huffman.cpp b/src/huffman.cpp index 8acf634c26..abe7000b29 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -535,7 +535,7 @@ MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { start_bit += len; } - assert(start_bit == out_data_hit_size); + assert(start_bit == encoded.nbits); assert(sizeof(int) * num_missed == out_data_miss_size); delete[] miss_buf; diff --git a/tests/include/compressors_regression.hpp b/tests/include/compressors_regression.hpp index fc2b58577d..cc1815a3a7 100644 --- a/tests/include/compressors_regression.hpp +++ b/tests/include/compressors_regression.hpp @@ -16,6 +16,16 @@ namespace mgard { MemoryBuffer compress_memory_huffman(long int *const src, const std::size_t srcLen); +//! Decompress an array compressed with `compress_memory_huffman`. +//! +//!\param[in] src Compressed array. +//!\param[in] srcLen Size in bytes of the compressed array. +//!\param[out] dst Decompressed array. +//!\param[in] dstLen Size in bytes of the decompressed array. +void decompress_memory_huffman(unsigned char *const src, + const std::size_t srcLen, long int *const dst, + const std::size_t dstLen); + } // namespace mgard #endif diff --git a/tests/src/compressors_regression.cpp b/tests/src/compressors_regression.cpp index 740a504528..eb4d53761b 100644 --- a/tests/src/compressors_regression.cpp +++ b/tests/src/compressors_regression.cpp @@ -16,6 +16,14 @@ static_assert(sizeof(unsigned int) == 4, static_assert(sizeof(std::size_t) == 8, "code written assuming `sizeof(std::size_t) == 8`"); +namespace { + +std::size_t hit_buffer_size(const std::size_t nbits) { + return nbits / CHAR_BIT + sizeof(unsigned int); +} + +} // namespace + // This code also makes endianness assumptions. 
MemoryBuffer compress_memory_huffman(long int *const src, @@ -82,4 +90,57 @@ MemoryBuffer compress_memory_huffman(long int *const src, return MemoryBuffer(buffer, bufferLen); } +void decompress_memory_huffman(unsigned char *const src, + const std::size_t srcLen, long int *const dst, + const std::size_t dstLen) { + std::size_t const *const sizes = reinterpret_cast(src); + const std::size_t nfrequencies = sizes[0]; + const std::size_t nbits = sizes[1]; + const std::size_t nmissed = sizes[2]; + const std::size_t nhit = hit_buffer_size(nbits); + + MemoryBuffer buffer(nfrequencies + nhit + nmissed); + { + const std::size_t offset = 3 * sizeof(std::size_t); + unsigned char const *const src_ = src + offset; + const std::size_t srcLen_ = srcLen - offset; + unsigned char *const dst_ = buffer.data.get(); + const std::size_t dstLen_ = buffer.size; + +#ifndef MGARD_ZSTD + decompress_memory_z(src_, srcLen_, dst_, dstLen_); +#else + decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); +#endif + } + + HuffmanEncodedStream encoded(nbits, nhit, nmissed, nfrequencies); + { + unsigned char const *begin; + unsigned char const *end; + + begin = buffer.data.get(); + end = begin + nfrequencies; + std::copy(begin, end, encoded.frequencies.data.get()); + + begin = end; + end = begin + nhit; + std::copy(begin, end, encoded.hit.data.get()); + + begin = end; + end = begin + nmissed; + std::copy(begin, end, encoded.missed.data.get()); + } + + const MemoryBuffer decoded = huffman_decoding(encoded); + { + long int const *const p = decoded.data.get(); + if (decoded.size * sizeof(*p) != dstLen) { + throw std::runtime_error( + "mismatch between expected and obtained decompressed buffer sizes"); + } + std::copy(p, p + decoded.size, dst); + } +} + } // namespace mgard diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp index 391bc46de3..af9de92915 100644 --- a/tests/src/test_compressors.cpp +++ b/tests/src/test_compressors.cpp @@ -41,12 +41,12 @@ void 
test_huffman_compression_regression(long int *const src, const mgard::MemoryBuffer out_ = mgard::compress_memory_huffman_rewritten(src_, srcLen); + delete[] src_; + REQUIRE(out.size == out_.size); unsigned char const *const p = out.data.get(); unsigned char const *const p_ = out_.data.get(); REQUIRE(std::equal(p, p + out.size, p_)); - - delete[] src_; } void test_huffman_decompression_regression(long int *const src, @@ -71,8 +71,8 @@ void test_huffman_decompression_regression(long int *const src, mgard::decompress_memory_huffman(q, compressed.size, p, out.size * sizeof(long int)); - mgard::decompress_memory_huffman(q_, compressed_.size, p_, - out_.size * sizeof(long int)); + mgard::decompress_memory_huffman_rewritten(q_, compressed_.size, p_, + out_.size * sizeof(long int)); REQUIRE(std::equal(p, p + srcLen, p_)); } diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index 5fed4bedec..d984c8caf9 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -12,51 +12,52 @@ namespace { void test_encoding_regression(long int *const quantized, const std::size_t N) { - long int *const quantized_new = new long int[N]; - std::copy(quantized, quantized + N, quantized_new); + long int *const quantized_ = new long int[N]; + std::copy(quantized, quantized + N, quantized_); const mgard::HuffmanEncodedStream out = mgard::huffman_encoding(quantized, N); - const mgard::HuffmanEncodedStream out_new = - mgard::huffman_encoding_rewritten(quantized_new, N); + const mgard::HuffmanEncodedStream out_ = + mgard::huffman_encoding_rewritten(quantized_, N); unsigned char const *const hit = out.hit.data.get(); - REQUIRE(out_new.nbits == out.nbits); + REQUIRE(out_.nbits == out.nbits); const std::size_t nbytes = (out.nbits + CHAR_BIT - 1) / CHAR_BIT; - REQUIRE(std::equal(hit, hit + nbytes, out_new.hit.data.get())); + REQUIRE(std::equal(hit, hit + nbytes, out_.hit.data.get())); unsigned char const *const missed = out.missed.data.get(); const std::size_t nmissed = 
out.missed.size; - REQUIRE(out_new.missed.size == nmissed); - REQUIRE(std::equal(missed, missed + nmissed, out_new.missed.data.get())); + REQUIRE(out_.missed.size == nmissed); + REQUIRE(std::equal(missed, missed + nmissed, out_.missed.data.get())); unsigned char const *const frequencies = out.frequencies.data.get(); const std::size_t nfrequencies = out.frequencies.size; - REQUIRE(out_new.frequencies.size == nfrequencies); + REQUIRE(out_.frequencies.size == nfrequencies); REQUIRE(std::equal(frequencies, frequencies + nfrequencies, - out_new.frequencies.data.get())); + out_.frequencies.data.get())); - delete[] quantized_new; + delete[] quantized_; } void test_decoding_regression(long int *const quantized, const std::size_t N) { - long int *const quantized_new = new long int[N]; - std::copy(quantized, quantized + N, quantized_new); + long int *const quantized_ = new long int[N]; + std::copy(quantized, quantized + N, quantized_); const mgard::HuffmanEncodedStream encoded = mgard::huffman_encoding(quantized, N); - const mgard::HuffmanEncodedStream encoded_new = - mgard::huffman_encoding(quantized_new, N); + const mgard::HuffmanEncodedStream encoded_ = + mgard::huffman_encoding(quantized_, N); - delete[] quantized_new; + delete[] quantized_; const mgard::MemoryBuffer out = mgard::huffman_decoding(encoded); - const mgard::MemoryBuffer out_new = - mgard::huffman_decoding_rewritten(encoded); + const mgard::MemoryBuffer out_ = + mgard::huffman_decoding_rewritten(encoded_); - REQUIRE(out.size == out_new.size); + REQUIRE(out.size == out_.size); + REQUIRE(out.size == N); long int const *const p = out.data.get(); - long int const *const p_new = out_new.data.get(); - REQUIRE(std::equal(p, p + out.size, p_new)); + long int const *const p_ = out_.data.get(); + REQUIRE(std::equal(p, p + out.size, p_)); } void test_encoding_regression_constant(const std::size_t N, const long int q) { From cc78644b8ef0ab4253b5892eb9773b9d43acf550 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 8 
Jun 2022 09:30:25 -0400 Subject: [PATCH 30/58] Remove `huffman_{en,de}coding` from library. --- include/huffman.hpp | 54 ++-- src/huffman.cpp | 365 -------------------------- tests/CMakeLists.txt | 1 + tests/include/huffman_regression.hpp | 28 ++ tests/src/compressors_regression.cpp | 2 + tests/src/huffman_regression.cpp | 373 +++++++++++++++++++++++++++ tests/src/test_huffman.cpp | 1 + 7 files changed, 430 insertions(+), 394 deletions(-) create mode 100644 tests/include/huffman_regression.hpp create mode 100644 tests/src/huffman_regression.cpp diff --git a/include/huffman.hpp b/include/huffman.hpp index d98f5b27e7..24a4be22af 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -14,6 +14,12 @@ namespace mgard { +//! One more than the number of symbols assigned codewords in the deprecated +//! Huffman encoding and decoding functions. +//! +//!\deprecated +inline constexpr std::size_t nql = 32768 * 4; + //! A stream compressed using a Huffman code. struct HuffmanEncodedStream { //! Constructor. @@ -38,35 +44,6 @@ struct HuffmanEncodedStream { MemoryBuffer frequencies; }; -//! Encode quantized coefficients using a Huffman code. -//! -//!\param[in, out] quantized_data Input buffer (quantized coefficients). This -//! buffer will be changed by the encoding process. -//!\param[in] n Number of symbols (`long int` quantized coefficients) in the -//! input buffer. -HuffmanEncodedStream huffman_encoding(long int *const quantized_data, - const std::size_t n); - -//! Encode quantized coefficients using a Huffman code. -//! -//!\param[in] quantized_data Input buffer (quantized coefficients). -//!\param[in] n Number of symbols (`long int` quantized coefficients) in the -//! input buffer. -HuffmanEncodedStream -huffman_encoding_rewritten(long int const *const quantized_data, - const std::size_t n); - -//! Decode a stream encoded using a Huffman code. -//! -//!\param[in] encoded Input buffer (Huffman-encoded stream). 
-MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded); - -//! Decode a stream encoded using a Huffman code. -//! -//!\param[in] encoded Input buffer (Huffman-encoded stream). -MemoryBuffer -huffman_decoding_rewritten(const HuffmanEncodedStream &encoded); - //! Codeword (in progress) associated to a node in a Huffman code creation tree. struct HuffmanCodeword { //! Bytes containing the bits of the codeword. @@ -228,6 +205,25 @@ template class HuffmanCode { const HuffmanCodeword codeword); }; +//! Encode quantized coefficients using a Huffman code. +//! +//!\deprecated +//! +//!\param[in] quantized_data Input buffer (quantized coefficients). +//!\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +HuffmanEncodedStream +huffman_encoding_rewritten(long int const *const quantized_data, + const std::size_t n); + +//! Decode a stream encoded using a Huffman code. +//! +//!\deprecated +//! +//!\param[in] encoded Input buffer (Huffman-encoded stream). +MemoryBuffer +huffman_decoding_rewritten(const HuffmanEncodedStream &encoded); + } // namespace mgard #include "huffman.tpp" diff --git a/src/huffman.cpp b/src/huffman.cpp index abe7000b29..7e4429c6e7 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -16,8 +16,6 @@ namespace mgard { -const int nql = 32768 * 4; - HuffmanEncodedStream::HuffmanEncodedStream(const std::size_t nbits, const std::size_t ncompressed, const std::size_t nmissed, @@ -55,192 +53,6 @@ CodeCreationTreeNode::CodeCreationTreeNode( const std::shared_ptr &right) : count(left->count + right->count), left(left), right(right) {} -//! Node in the Huffman code creation tree. -struct htree_node { - //! Constructor. - //! - //!\param q (Transformed) symbol. - //!\param cnt Number of occurences of the (transformed) symbol in the source. - htree_node(const int q, const std::size_t cnt) - : q(q), cnt(cnt), code(0), len(0), left(nullptr), right(nullptr) {} - - //! (Transformed) symbol. - int q; - - //! 
Number of occurences of the (transformed) symbol in the source. - std::size_t cnt; - - //! Codeword associated to the (transformed) symbol. - unsigned int code; - - //! Length in bits of the codeword. - std::size_t len; - - //! Left child in the code creation tree. - htree_node *left; - - //! Right child in the code creation tree. - htree_node *right; -}; - -//! Input symbol–Huffman code pair. -struct huffman_codec { - //! (Transformed) symbol. - int q; - - //! Codeword associated to the (transformed) symbol. - unsigned int code; - - //! Length in bits of the codeword. - std::size_t len; -}; - -//! Frequency table and symbol–code mappings for encoding source. -template struct HuffmanCodec { - // The arrays are value-initialized, which leads to each of their elements - // being value-initialized (ultimately zero-initialized). - - //! Input symbol–Huffman code pairs. - std::array codec{}; - - //! Frequency table for encoding source. - std::array frequency_table{}; -}; - -//! Function object for comparing Huffman code creation nodes. -struct LessThanByCnt { - //! Return whether the first node has a larger count than the second. - //! - //!\param lhs First node. - //!\param rhs Second node. 
- bool operator()(htree_node const *const lhs, - htree_node const *const rhs) const { - return lhs->cnt > rhs->cnt; - } -}; - -template -using my_priority_queue = - std::priority_queue, LessThanByCnt>; - -void initialize_codec(HuffmanCodec &codec, htree_node *const root, - const unsigned int code, const std::size_t len) { - std::array &codewords = codec.codec; - - root->code = code; - root->len = len; - - if (!root->left && !root->right) { - const std::size_t index = root->q; - codewords.at(index) = {root->q, code, len}; - } - - if (root->left) { - initialize_codec(codec, root->left, code << 1, len + 1); - } - - if (root->right) { - initialize_codec(codec, root->right, code << 1 | 0x1, len + 1); - } -} - -my_priority_queue *build_tree(std::size_t const *const cnt) { - my_priority_queue *const phtree = - new my_priority_queue; - for (int i = 0; i < nql; i++) { - if (cnt[i] != 0) { - htree_node *const new_node = new htree_node(i, cnt[i]); - phtree->push(new_node); - } - } - - while (phtree->size() > 1) { - htree_node *const top_node1 = phtree->top(); - phtree->pop(); - htree_node *const top_node2 = phtree->top(); - phtree->pop(); - - htree_node *const new_node = - new htree_node(-1, top_node1->cnt + top_node2->cnt); - new_node->left = top_node1; - new_node->right = top_node2; - phtree->push(new_node); - } - return phtree; -} - -void free_htree_node(htree_node *const node) { - if (node->left) { - free_htree_node(node->left); - node->left = nullptr; - } - - if (node->right) { - free_htree_node(node->right); - node->right = nullptr; - } - - delete node; -} - -void free_tree(my_priority_queue *const phtree) { - if (phtree) { - free_htree_node(phtree->top()); - - phtree->pop(); - - delete phtree; - } -} - -//! Populate the frequency table of a `HuffmanCodec`. -//! -//!\note This function will change the quantized data. -//! -//!\param[in, out] quantized_data Input buffer (quantized coefficients). This -//! buffer will be changed by the codec-building process. 
-//\param[in] n Number of symbols (`long int` quantized coefficients) in the -//! input buffer. -void initialize_frequency_table(HuffmanCodec &codec, - long int *const quantized_data, - const std::size_t n) { - assert(*std::max_element(codec.frequency_table.begin(), - codec.frequency_table.end()) == 0); - - for (std::size_t i = 0; i < n; i++) { - // Convert quantization level to positive so that counting freq can be - // easily done. Level 0 is reserved a out-of-range flag. - quantized_data[i] = quantized_data[i] + nql / 2; - ++codec.frequency_table[quantized_data[i] > 0 && - quantized_data[i] < - static_cast(nql) - ? quantized_data[i] - : 0]; - } -} - -//! Build a Huffman codec for an input buffer. -//! -//!\param[in, out] quantized_data Input buffer (quantized coefficients). This -//! buffer will be changed by the codec-building process. -//\param[in] n Number of symbols (`long int` quantized coefficients) in the -//! input buffer. -template -HuffmanCodec build_huffman_codec(long int *const quantized_data, - const std::size_t n) { - HuffmanCodec codec; - initialize_frequency_table(codec, quantized_data, n); - - my_priority_queue *const phtree = - build_tree(codec.frequency_table.data()); - - initialize_codec(codec, phtree->top(), 0, 0); - - free_tree(phtree); - - return codec; -} - namespace { void endianness_shuffle(unsigned char *const buffer, const std::size_t nbytes) { @@ -262,90 +74,6 @@ void endianness_shuffle(unsigned char *const buffer, const std::size_t nbytes) { } } // namespace - -HuffmanEncodedStream huffman_encoding(long int *const quantized_data, - const std::size_t n) { - const HuffmanCodec codec = build_huffman_codec(quantized_data, n); - const std::size_t num_miss = codec.frequency_table[0]; - - assert(n >= num_miss); - - std::size_t nnz = 0; - std::size_t nbits = 0; - for (std::size_t i = 0; i < nql; ++i) { - const huffman_codec &codec_ = codec.codec.at(i); - const std::size_t frequency = codec.frequency_table.at(i); - nbits += frequency * 
codec_.len; - nnz += frequency ? 1 : 0; - } - - const std::size_t nbytes = - sizeof(unsigned int) * ((nbits + CHAR_BIT * sizeof(unsigned int) - 1) / - (CHAR_BIT * sizeof(unsigned int))); - HuffmanEncodedStream out(nbits, nbytes, num_miss * sizeof(int), - 2 * nnz * sizeof(std::size_t)); - - unsigned int *const hit = - reinterpret_cast(out.hit.data.get()); - std::fill(hit, hit + nbytes / sizeof(unsigned int), 0u); - - int *missed = reinterpret_cast(out.missed.data.get()); - - // write frequency table to buffer - std::size_t *const cft = - reinterpret_cast(out.frequencies.data.get()); - std::size_t off = 0; - for (std::size_t i = 0; i < nql; ++i) { - if (codec.frequency_table[i] > 0) { - cft[2 * off] = i; - cft[2 * off + 1] = codec.frequency_table[i]; - off++; - } - } - - std::size_t start_bit = 0; - for (std::size_t i = 0; i < n; i++) { - const int q = quantized_data[i]; - unsigned int code; - std::size_t len; - - if (q > 0 && q < nql) { - // for those that are within the range - code = codec.codec[q].code; - len = codec.codec[q].len; - } else { - // for those that are out of the range, q is set to 0 - code = codec.codec[0].code; - len = codec.codec[0].len; - - *missed++ = q; - } - - // Note that if len == 0, then that means that either the data is all the - // same number or (more likely) all data are outside the quantization - // range. Either way, the code contains no information and is therefore 0 - // bits. 
- - if (32 - start_bit % 32 < len) { - // current unsigned int cannot hold the code - // copy 32 - start_bit % 32 bits to the current int - // and copy the rest len - (32 - start_bit % 32) to the next int - const std::size_t rshift = len - (32 - start_bit % 32); - const std::size_t lshift = 32 - rshift; - *(hit + start_bit / 32) = (*(hit + start_bit / 32)) | (code >> rshift); - *(hit + start_bit / 32 + 1) = - (*(hit + start_bit / 32 + 1)) | (code << lshift); - } else if (len) { - code = code << (32 - start_bit % 32 - len); - *(hit + start_bit / 32) = (*(hit + start_bit / 32)) | code; - } - // No effect if `len == 0`. - start_bit += len; - } - - return out; -} - namespace { void check_type_sizes() { @@ -451,99 +179,6 @@ huffman_encoding_rewritten(long int const *const quantized_data, return out; } -MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { - const std::size_t out_data_miss_size = encoded.missed.size; - const std::size_t out_tree_size = encoded.frequencies.size; - unsigned char const *const out_data_hit = encoded.hit.data.get(); - unsigned char const *const out_data_miss = encoded.missed.data.get(); - unsigned char const *const out_tree = encoded.frequencies.data.get(); - - std::size_t const *const cft = (std::size_t const *)out_tree; - const std::size_t nnz = out_tree_size / (2 * sizeof(std::size_t)); - // The elements of the array are value-initialized (here, zero-initialized). - std::size_t *const ft = new std::size_t[nql](); - - std::size_t nquantized = 0; - for (std::size_t j = 0; j < nnz; ++j) { - const std::size_t frequency = cft[2 * j + 1]; - nquantized += frequency; - ft[cft[2 * j]] = frequency; - } - - MemoryBuffer out(nquantized); - long int *const quantized_data = out.data.get(); - - my_priority_queue *const phtree = build_tree(ft); - delete[] ft; - - unsigned int const *const buf = (unsigned int const *)out_data_hit; - - // The out_data_miss may not be aligned. Therefore, the code - // here makes a new buffer. 
- assert(not(out_data_miss_size % sizeof(int))); - int *const miss_buf = new int[out_data_miss_size / sizeof(int)]; - if (out_data_miss_size) { - std::memcpy(miss_buf, out_data_miss, out_data_miss_size); - } - - int const *miss_bufp = miss_buf; - - std::size_t start_bit = 0; - unsigned int mask = 0x80000000; - - long int *q = quantized_data; - std::size_t i = 0; - std::size_t num_missed = 0; - while (q < quantized_data + nquantized) { - htree_node const *root = phtree->top(); - assert(root); - - std::size_t len = 0; - int offset = 0; - while (root->left) { - int flag = *(buf + start_bit / 32 + offset) & mask; - if (!flag) { - root = root->left; - } else { - root = root->right; - } - - len++; - - mask >>= 1; - if (!mask) { - mask = 0x80000000; - offset = 1; - } else { - // offset = 0; - } - } - - if (root->q != 0) { - *q = root->q - nql / 2; - - } else { - *q = *miss_bufp - nql / 2; - - miss_bufp++; - num_missed++; - } - - q++; - i++; - - start_bit += len; - } - - assert(start_bit == encoded.nbits); - assert(sizeof(int) * num_missed == out_data_miss_size); - - delete[] miss_buf; - free_tree(phtree); - - return out; -} - namespace { long int decode(const HuffmanCode &code, diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cfa61b4382..f625d0a148 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -21,6 +21,7 @@ set( "src/compressors_regression.cpp" "src/test_compressors.cpp" "src/test_CompressedDataset.cpp" + "src/huffman_regression.cpp" "src/test_huffman.cpp" ) diff --git a/tests/include/huffman_regression.hpp b/tests/include/huffman_regression.hpp new file mode 100644 index 0000000000..f10919d2ea --- /dev/null +++ b/tests/include/huffman_regression.hpp @@ -0,0 +1,28 @@ +#ifndef TESTING_HUFFMAN_REGRESSION_HPP +#define TESTING_HUFFMAN_REGRESSION_HPP +//!\file +//!\brief Huffman encoding and decoding functions for regression tests. + +#include + +#include "huffman.hpp" + +namespace mgard { + +//! 
Encode quantized coefficients using a Huffman code. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! buffer will be changed by the encoding process. +//!\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +HuffmanEncodedStream huffman_encoding(long int *const quantized_data, + const std::size_t n); + +//! Decode a stream encoded using a Huffman code. +//! +//!\param[in] encoded Input buffer (Huffman-encoded stream). +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded); + +} // namespace mgard + +#endif diff --git a/tests/src/compressors_regression.cpp b/tests/src/compressors_regression.cpp index eb4d53761b..c3dfdc9bc2 100644 --- a/tests/src/compressors_regression.cpp +++ b/tests/src/compressors_regression.cpp @@ -4,7 +4,9 @@ #include #include "compressors.hpp" +#include "compressors_regression.hpp" #include "huffman.hpp" +#include "huffman_regression.hpp" namespace mgard { diff --git a/tests/src/huffman_regression.cpp b/tests/src/huffman_regression.cpp new file mode 100644 index 0000000000..1bd397b3b6 --- /dev/null +++ b/tests/src/huffman_regression.cpp @@ -0,0 +1,373 @@ +#include "huffman_regression.hpp" + +#include +#include + +#include +#include + +namespace mgard { + +//! Node in the Huffman code creation tree. +struct htree_node { + //! Constructor. + //! + //!\param q (Transformed) symbol. + //!\param cnt Number of occurences of the (transformed) symbol in the source. + htree_node(const int q, const std::size_t cnt) + : q(q), cnt(cnt), code(0), len(0), left(nullptr), right(nullptr) {} + + //! (Transformed) symbol. + int q; + + //! Number of occurences of the (transformed) symbol in the source. + std::size_t cnt; + + //! Codeword associated to the (transformed) symbol. + unsigned int code; + + //! Length in bits of the codeword. + std::size_t len; + + //! Left child in the code creation tree. + htree_node *left; + + //! Right child in the code creation tree. 
+ htree_node *right; +}; + +//! Input symbol–Huffman code pair. +struct huffman_codec { + //! (Transformed) symbol. + int q; + + //! Codeword associated to the (transformed) symbol. + unsigned int code; + + //! Length in bits of the codeword. + std::size_t len; +}; + +//! Frequency table and symbol–code mappings for encoding source. +template struct HuffmanCodec { + // The arrays are value-initialized, which leads to each of their elements + // being value-initialized (ultimately zero-initialized). + + //! Input symbol–Huffman code pairs. + std::array codec{}; + + //! Frequency table for encoding source. + std::array frequency_table{}; +}; + +//! Function object for comparing Huffman code creation nodes. +struct LessThanByCnt { + //! Return whether the first node has a larger count than the second. + //! + //!\param lhs First node. + //!\param rhs Second node. + bool operator()(htree_node const *const lhs, + htree_node const *const rhs) const { + return lhs->cnt > rhs->cnt; + } +}; + +template +using my_priority_queue = + std::priority_queue, LessThanByCnt>; + +void initialize_codec(HuffmanCodec &codec, htree_node *const root, + const unsigned int code, const std::size_t len) { + std::array &codewords = codec.codec; + + root->code = code; + root->len = len; + + if (!root->left && !root->right) { + const std::size_t index = root->q; + codewords.at(index) = {root->q, code, len}; + } + + if (root->left) { + initialize_codec(codec, root->left, code << 1, len + 1); + } + + if (root->right) { + initialize_codec(codec, root->right, code << 1 | 0x1, len + 1); + } +} + +my_priority_queue *build_tree(std::size_t const *const cnt) { + my_priority_queue *const phtree = + new my_priority_queue; + for (std::size_t i = 0; i < nql; i++) { + if (cnt[i] != 0) { + htree_node *const new_node = new htree_node(i, cnt[i]); + phtree->push(new_node); + } + } + + while (phtree->size() > 1) { + htree_node *const top_node1 = phtree->top(); + phtree->pop(); + htree_node *const top_node2 = 
phtree->top(); + phtree->pop(); + + htree_node *const new_node = + new htree_node(-1, top_node1->cnt + top_node2->cnt); + new_node->left = top_node1; + new_node->right = top_node2; + phtree->push(new_node); + } + return phtree; +} + +void free_htree_node(htree_node *const node) { + if (node->left) { + free_htree_node(node->left); + node->left = nullptr; + } + + if (node->right) { + free_htree_node(node->right); + node->right = nullptr; + } + + delete node; +} + +void free_tree(my_priority_queue *const phtree) { + if (phtree) { + free_htree_node(phtree->top()); + + phtree->pop(); + + delete phtree; + } +} + +//! Populate the frequency table of a `HuffmanCodec`. +//! +//!\note This function will change the quantized data. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! buffer will be changed by the codec-building process. +//\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +void initialize_frequency_table(HuffmanCodec &codec, + long int *const quantized_data, + const std::size_t n) { + assert(*std::max_element(codec.frequency_table.begin(), + codec.frequency_table.end()) == 0); + + for (std::size_t i = 0; i < n; i++) { + // Convert quantization level to positive so that counting freq can be + // easily done. Level 0 is reserved a out-of-range flag. + quantized_data[i] = quantized_data[i] + nql / 2; + ++codec.frequency_table[quantized_data[i] > 0 && + quantized_data[i] < + static_cast(nql) + ? quantized_data[i] + : 0]; + } +} + +//! Build a Huffman codec for an input buffer. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! buffer will be changed by the codec-building process. +//\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. 
+template +HuffmanCodec build_huffman_codec(long int *const quantized_data, + const std::size_t n) { + HuffmanCodec codec; + initialize_frequency_table(codec, quantized_data, n); + + my_priority_queue *const phtree = + build_tree(codec.frequency_table.data()); + + initialize_codec(codec, phtree->top(), 0, 0); + + free_tree(phtree); + + return codec; +} + +HuffmanEncodedStream huffman_encoding(long int *const quantized_data, + const std::size_t n) { + const HuffmanCodec codec = build_huffman_codec(quantized_data, n); + const std::size_t num_miss = codec.frequency_table[0]; + + assert(n >= num_miss); + + std::size_t nnz = 0; + std::size_t nbits = 0; + for (std::size_t i = 0; i < nql; ++i) { + const huffman_codec &codec_ = codec.codec.at(i); + const std::size_t frequency = codec.frequency_table.at(i); + nbits += frequency * codec_.len; + nnz += frequency ? 1 : 0; + } + + const std::size_t nbytes = + sizeof(unsigned int) * ((nbits + CHAR_BIT * sizeof(unsigned int) - 1) / + (CHAR_BIT * sizeof(unsigned int))); + HuffmanEncodedStream out(nbits, nbytes, num_miss * sizeof(int), + 2 * nnz * sizeof(std::size_t)); + + unsigned int *const hit = + reinterpret_cast(out.hit.data.get()); + std::fill(hit, hit + nbytes / sizeof(unsigned int), 0u); + + int *missed = reinterpret_cast(out.missed.data.get()); + + // write frequency table to buffer + std::size_t *const cft = + reinterpret_cast(out.frequencies.data.get()); + std::size_t off = 0; + for (std::size_t i = 0; i < nql; ++i) { + if (codec.frequency_table[i] > 0) { + cft[2 * off] = i; + cft[2 * off + 1] = codec.frequency_table[i]; + off++; + } + } + + std::size_t start_bit = 0; + for (std::size_t i = 0; i < n; i++) { + const int q = quantized_data[i]; + unsigned int code; + std::size_t len; + + if (q > 0 && q < static_cast(nql)) { + // for those that are within the range + code = codec.codec[q].code; + len = codec.codec[q].len; + } else { + // for those that are out of the range, q is set to 0 + code = codec.codec[0].code; + len = 
codec.codec[0].len; + + *missed++ = q; + } + + // Note that if len == 0, then that means that either the data is all the + // same number or (more likely) all data are outside the quantization + // range. Either way, the code contains no information and is therefore 0 + // bits. + + if (32 - start_bit % 32 < len) { + // current unsigned int cannot hold the code + // copy 32 - start_bit % 32 bits to the current int + // and copy the rest len - (32 - start_bit % 32) to the next int + const std::size_t rshift = len - (32 - start_bit % 32); + const std::size_t lshift = 32 - rshift; + *(hit + start_bit / 32) = (*(hit + start_bit / 32)) | (code >> rshift); + *(hit + start_bit / 32 + 1) = + (*(hit + start_bit / 32 + 1)) | (code << lshift); + } else if (len) { + code = code << (32 - start_bit % 32 - len); + *(hit + start_bit / 32) = (*(hit + start_bit / 32)) | code; + } + // No effect if `len == 0`. + start_bit += len; + } + + return out; +} + +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { + const std::size_t out_data_miss_size = encoded.missed.size; + const std::size_t out_tree_size = encoded.frequencies.size; + unsigned char const *const out_data_hit = encoded.hit.data.get(); + unsigned char const *const out_data_miss = encoded.missed.data.get(); + unsigned char const *const out_tree = encoded.frequencies.data.get(); + + std::size_t const *const cft = (std::size_t const *)out_tree; + const std::size_t nnz = out_tree_size / (2 * sizeof(std::size_t)); + // The elements of the array are value-initialized (here, zero-initialized). 
+ std::size_t *const ft = new std::size_t[nql](); + + std::size_t nquantized = 0; + for (std::size_t j = 0; j < nnz; ++j) { + const std::size_t frequency = cft[2 * j + 1]; + nquantized += frequency; + ft[cft[2 * j]] = frequency; + } + + MemoryBuffer out(nquantized); + long int *const quantized_data = out.data.get(); + + my_priority_queue *const phtree = build_tree(ft); + delete[] ft; + + unsigned int const *const buf = (unsigned int const *)out_data_hit; + + // The out_data_miss may not be aligned. Therefore, the code + // here makes a new buffer. + assert(not(out_data_miss_size % sizeof(int))); + int *const miss_buf = new int[out_data_miss_size / sizeof(int)]; + if (out_data_miss_size) { + std::memcpy(miss_buf, out_data_miss, out_data_miss_size); + } + + int const *miss_bufp = miss_buf; + + std::size_t start_bit = 0; + unsigned int mask = 0x80000000; + + long int *q = quantized_data; + std::size_t i = 0; + std::size_t num_missed = 0; + while (q < quantized_data + nquantized) { + htree_node const *root = phtree->top(); + assert(root); + + std::size_t len = 0; + int offset = 0; + while (root->left) { + int flag = *(buf + start_bit / 32 + offset) & mask; + if (!flag) { + root = root->left; + } else { + root = root->right; + } + + len++; + + mask >>= 1; + if (!mask) { + mask = 0x80000000; + offset = 1; + } else { + // offset = 0; + } + } + + if (root->q != 0) { + *q = root->q - nql / 2; + + } else { + *q = *miss_bufp - nql / 2; + + miss_bufp++; + num_missed++; + } + + q++; + i++; + + start_bit += len; + } + + assert(start_bit == encoded.nbits); + assert(sizeof(int) * num_missed == out_data_miss_size); + + delete[] miss_buf; + free_tree(phtree); + + return out; +} + +} // namespace mgard diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index d984c8caf9..44cec46b67 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -8,6 +8,7 @@ #include "testing_utilities.hpp" #include "huffman.hpp" +#include "huffman_regression.hpp" 
namespace { From 2a4619274c26595defae76bbac0759928207312f Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 8 Jun 2022 09:58:58 -0400 Subject: [PATCH 31/58] Rename reimplemented Huffman functions. --- include/compressors.hpp | 12 ++++------ include/huffman.hpp | 8 +++---- src/compressors.cpp | 29 +++++++++++------------- src/huffman.cpp | 8 +++---- tests/include/compressors_regression.hpp | 4 ++++ tests/include/huffman_regression.hpp | 4 ++++ tests/src/compressors_regression.cpp | 10 ++++++-- tests/src/huffman_regression.cpp | 4 ++++ tests/src/test_compressors.cpp | 22 +++++++++--------- tests/src/test_huffman.cpp | 15 ++++++------ 10 files changed, 63 insertions(+), 53 deletions(-) diff --git a/include/compressors.hpp b/include/compressors.hpp index 09f8c53c22..a8048966fa 100644 --- a/include/compressors.hpp +++ b/include/compressors.hpp @@ -22,9 +22,8 @@ namespace mgard { //! //!\param[in] src Array to be compressed. //!\param[in] srcLen Size of array (number of elements) to be compressed. -MemoryBuffer -compress_memory_huffman_rewritten(long int *const src, - const std::size_t srcLen); +MemoryBuffer compress_memory_huffman(long int *const src, + const std::size_t srcLen); //! Decompress an array compressed with `compress_memory_huffman`. //! @@ -34,10 +33,9 @@ compress_memory_huffman_rewritten(long int *const src, //!\param[in] srcLen Size in bytes of the compressed array. //!\param[out] dst Decompressed array. //!\param[in] dstLen Size in bytes of the decompressed array. -void decompress_memory_huffman_rewritten(unsigned char *const src, - const std::size_t srcLen, - long int *const dst, - const std::size_t dstLen); +void decompress_memory_huffman(unsigned char *const src, + const std::size_t srcLen, long int *const dst, + const std::size_t dstLen); #ifdef MGARD_ZSTD //! Compress an array using `zstd`. 
diff --git a/include/huffman.hpp b/include/huffman.hpp index 24a4be22af..0c8b0a5b93 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -212,17 +212,15 @@ template class HuffmanCode { //!\param[in] quantized_data Input buffer (quantized coefficients). //!\param[in] n Number of symbols (`long int` quantized coefficients) in the //! input buffer. -HuffmanEncodedStream -huffman_encoding_rewritten(long int const *const quantized_data, - const std::size_t n); +HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, + const std::size_t n); //! Decode a stream encoded using a Huffman code. //! //!\deprecated //! //!\param[in] encoded Input buffer (Huffman-encoded stream). -MemoryBuffer -huffman_decoding_rewritten(const HuffmanEncodedStream &encoded); +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded); } // namespace mgard diff --git a/src/compressors.cpp b/src/compressors.cpp index f97528d3d6..d31a24a06a 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -30,10 +30,9 @@ std::size_t hit_buffer_size(const std::size_t nbits) { } // namespace -void decompress_memory_huffman_rewritten(unsigned char *const src, - const std::size_t srcLen, - long int *const dst, - const std::size_t dstLen) { +void decompress_memory_huffman(unsigned char *const src, + const std::size_t srcLen, long int *const dst, + const std::size_t dstLen) { std::size_t const *const sizes = reinterpret_cast(src); const std::size_t nfrequencies = sizes[0]; const std::size_t nbits = sizes[1]; @@ -55,8 +54,8 @@ void decompress_memory_huffman_rewritten(unsigned char *const src, #endif } - // `huffman_decoding_rewritten` expects the size of the hit buffer to be a - // multiple of `sizeof(unsigned int)`. We'll zero out any extra bytes below. + // `huffman_decoding` expects the size of the hit buffer to be a multiple of + // `sizeof(unsigned int)`. We'll zero out any extra bytes below. 
const std::size_t nbytes = sizeof(unsigned int) * ((nhit + sizeof(unsigned int) - 1) / sizeof(unsigned int)); @@ -83,7 +82,7 @@ void decompress_memory_huffman_rewritten(unsigned char *const src, std::copy(begin, end, encoded.missed.data.get()); } - const MemoryBuffer decoded = huffman_decoding_rewritten(encoded); + const MemoryBuffer decoded = huffman_decoding(encoded); { long int const *const p = decoded.data.get(); if (decoded.size * sizeof(*p) != dstLen) { @@ -115,10 +114,9 @@ gather_constituents(const std::vector &constituents) { } // namespace -MemoryBuffer -compress_memory_huffman_rewritten(long int *const src, - const std::size_t srcLen) { - const HuffmanEncodedStream encoded = huffman_encoding_rewritten(src, srcLen); +MemoryBuffer compress_memory_huffman(long int *const src, + const std::size_t srcLen) { + const HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); assert(not(encoded.hit.size % sizeof(unsigned int))); @@ -302,8 +300,8 @@ MemoryBuffer compress(const pb::Header &header, void *const src, if (srcLen % qts) { throw std::runtime_error("incorrect quantization buffer size"); } - return compress_memory_huffman_rewritten(reinterpret_cast(src), - srcLen / qts); + return compress_memory_huffman(reinterpret_cast(src), + srcLen / qts); } #else throw std::runtime_error("MGARD compiled without ZSTD support"); @@ -336,9 +334,8 @@ void decompress(const pb::Header &header, void *const src, break; case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD - decompress_memory_huffman_rewritten(static_cast(src), - srcLen, static_cast(dst), - dstLen); + decompress_memory_huffman(static_cast(src), srcLen, + static_cast(dst), dstLen); break; #else throw std::runtime_error("MGARD compiled without ZSTD support"); diff --git a/src/huffman.cpp b/src/huffman.cpp index 7e4429c6e7..fd5ecd6a33 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -91,9 +91,8 @@ void check_type_sizes() { } // namespace -HuffmanEncodedStream -huffman_encoding_rewritten(long int const 
*const quantized_data, - const std::size_t n) { +HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, + const std::size_t n) { check_type_sizes(); const std::size_t ncodewords = nql - 1; @@ -194,8 +193,7 @@ long int decode(const HuffmanCode &code, } // namespace -MemoryBuffer -huffman_decoding_rewritten(const HuffmanEncodedStream &encoded) { +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { check_type_sizes(); using Symbol = long int; diff --git a/tests/include/compressors_regression.hpp b/tests/include/compressors_regression.hpp index cc1815a3a7..bfb2426a5d 100644 --- a/tests/include/compressors_regression.hpp +++ b/tests/include/compressors_regression.hpp @@ -9,6 +9,8 @@ namespace mgard { +namespace regression { + //! Compress an array using a Huffman tree. //! //!\param[in] src Array to be compressed. @@ -26,6 +28,8 @@ void decompress_memory_huffman(unsigned char *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen); +} // namespace regression + } // namespace mgard #endif diff --git a/tests/include/huffman_regression.hpp b/tests/include/huffman_regression.hpp index f10919d2ea..d67cd6b4ad 100644 --- a/tests/include/huffman_regression.hpp +++ b/tests/include/huffman_regression.hpp @@ -9,6 +9,8 @@ namespace mgard { +namespace regression { + //! Encode quantized coefficients using a Huffman code. //! //!\param[in, out] quantized_data Input buffer (quantized coefficients). This @@ -23,6 +25,8 @@ HuffmanEncodedStream huffman_encoding(long int *const quantized_data, //!\param[in] encoded Input buffer (Huffman-encoded stream). 
MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded); +} // namespace regression + } // namespace mgard #endif diff --git a/tests/src/compressors_regression.cpp b/tests/src/compressors_regression.cpp index c3dfdc9bc2..dd98384898 100644 --- a/tests/src/compressors_regression.cpp +++ b/tests/src/compressors_regression.cpp @@ -10,6 +10,8 @@ namespace mgard { +namespace regression { + static_assert(CHAR_BIT == 8, "code written assuming `CHAR_BIT == 8`"); static_assert(sizeof(unsigned int) == 4, @@ -30,7 +32,8 @@ std::size_t hit_buffer_size(const std::size_t nbits) { MemoryBuffer compress_memory_huffman(long int *const src, const std::size_t srcLen) { - HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); + HuffmanEncodedStream encoded = + mgard::regression::huffman_encoding(src, srcLen); assert(not(encoded.hit.size % sizeof(unsigned int))); @@ -134,7 +137,8 @@ void decompress_memory_huffman(unsigned char *const src, std::copy(begin, end, encoded.missed.data.get()); } - const MemoryBuffer decoded = huffman_decoding(encoded); + const MemoryBuffer decoded = + mgard::regression::huffman_decoding(encoded); { long int const *const p = decoded.data.get(); if (decoded.size * sizeof(*p) != dstLen) { @@ -145,4 +149,6 @@ void decompress_memory_huffman(unsigned char *const src, } } +} // namespace regression + } // namespace mgard diff --git a/tests/src/huffman_regression.cpp b/tests/src/huffman_regression.cpp index 1bd397b3b6..5fbc4b74dd 100644 --- a/tests/src/huffman_regression.cpp +++ b/tests/src/huffman_regression.cpp @@ -8,6 +8,8 @@ namespace mgard { +namespace regression { + //! Node in the Huffman code creation tree. struct htree_node { //! Constructor. 
@@ -370,4 +372,6 @@ MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { return out; } +} // namespace regression + } // namespace mgard diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp index af9de92915..4d5a42048b 100644 --- a/tests/src/test_compressors.cpp +++ b/tests/src/test_compressors.cpp @@ -37,9 +37,9 @@ void test_huffman_compression_regression(long int *const src, std::copy(src, src + srcLen, src_); const mgard::MemoryBuffer out = - mgard::compress_memory_huffman(src, srcLen); + mgard::regression::compress_memory_huffman(src, srcLen); const mgard::MemoryBuffer out_ = - mgard::compress_memory_huffman_rewritten(src_, srcLen); + mgard::compress_memory_huffman(src_, srcLen); delete[] src_; @@ -55,9 +55,9 @@ void test_huffman_decompression_regression(long int *const src, std::copy(src, src + srcLen, src_); const mgard::MemoryBuffer compressed = - mgard::compress_memory_huffman(src, srcLen); + mgard::regression::compress_memory_huffman(src, srcLen); const mgard::MemoryBuffer compressed_ = - mgard::compress_memory_huffman(src_, srcLen); + mgard::regression::compress_memory_huffman(src_, srcLen); delete[] src_; @@ -69,10 +69,10 @@ void test_huffman_decompression_regression(long int *const src, long int *const p = out.data.get(); long int *const p_ = out_.data.get(); - mgard::decompress_memory_huffman(q, compressed.size, p, - out.size * sizeof(long int)); - mgard::decompress_memory_huffman_rewritten(q_, compressed_.size, p_, - out_.size * sizeof(long int)); + mgard::regression::decompress_memory_huffman(q, compressed.size, p, + out.size * sizeof(long int)); + mgard::decompress_memory_huffman(q_, compressed_.size, p_, + out_.size * sizeof(long int)); REQUIRE(std::equal(p, p + srcLen, p_)); } @@ -268,8 +268,8 @@ TEST_CASE("compression with header configuration", "[compressors]") { REQUIRE(e.preprocessor() == mgard::pb::Encoding::SHUFFLE); #ifdef MGARD_ZSTD REQUIRE(e.compressor() == mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); - 
mgard::decompress_memory_huffman(compressed.data.get(), compressed.size, dst, - quantizedLen); + mgard::regression::decompress_memory_huffman( + compressed.data.get(), compressed.size, dst, quantizedLen); #else REQUIRE(e.compressor() == mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); mgard::decompress_memory_z(compressed.data.get(), compressed.size, dst, @@ -339,7 +339,7 @@ TEST_CASE("decompression with header configuration", "[compressors]") { std::int64_t *const quantized_ = new std::int64_t[ndof]; std::copy(quantized, quantized + ndof, quantized_); const mgard::MemoryBuffer out = - mgard::compress_memory_huffman(quantized_, ndof); + mgard::regression::compress_memory_huffman(quantized_, ndof); delete[] quantized_; const std::size_t srcLen = out.size; diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index 44cec46b67..95eeb1af0b 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -16,9 +16,10 @@ void test_encoding_regression(long int *const quantized, const std::size_t N) { long int *const quantized_ = new long int[N]; std::copy(quantized, quantized + N, quantized_); - const mgard::HuffmanEncodedStream out = mgard::huffman_encoding(quantized, N); + const mgard::HuffmanEncodedStream out = + mgard::regression::huffman_encoding(quantized, N); const mgard::HuffmanEncodedStream out_ = - mgard::huffman_encoding_rewritten(quantized_, N); + mgard::huffman_encoding(quantized_, N); unsigned char const *const hit = out.hit.data.get(); REQUIRE(out_.nbits == out.nbits); @@ -44,15 +45,15 @@ void test_decoding_regression(long int *const quantized, const std::size_t N) { std::copy(quantized, quantized + N, quantized_); const mgard::HuffmanEncodedStream encoded = - mgard::huffman_encoding(quantized, N); + mgard::regression::huffman_encoding(quantized, N); const mgard::HuffmanEncodedStream encoded_ = - mgard::huffman_encoding(quantized_, N); + mgard::regression::huffman_encoding(quantized_, N); delete[] quantized_; - const mgard::MemoryBuffer out 
= mgard::huffman_decoding(encoded); - const mgard::MemoryBuffer out_ = - mgard::huffman_decoding_rewritten(encoded_); + const mgard::MemoryBuffer out = + mgard::regression::huffman_decoding(encoded); + const mgard::MemoryBuffer out_ = mgard::huffman_decoding(encoded_); REQUIRE(out.size == out_.size); REQUIRE(out.size == N); From 7d2825b442ccde216bdc53585184859b26cd130e Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 8 Jun 2022 10:37:58 -0400 Subject: [PATCH 32/58] Copy input buffer in legacy Huffman encoder. --- include/compressors.hpp | 4 ++-- src/compressors.cpp | 4 ++-- tests/include/compressors_regression.hpp | 4 ++-- tests/include/huffman_regression.hpp | 4 +++- tests/src/compressors_regression.cpp | 4 ++-- tests/src/huffman_regression.cpp | 11 ++++++++--- tests/src/test_compressors.cpp | 18 ++++-------------- tests/src/test_huffman.cpp | 20 ++++++-------------- 8 files changed, 29 insertions(+), 40 deletions(-) diff --git a/include/compressors.hpp b/include/compressors.hpp index a8048966fa..1542d3eeb2 100644 --- a/include/compressors.hpp +++ b/include/compressors.hpp @@ -22,7 +22,7 @@ namespace mgard { //! //!\param[in] src Array to be compressed. //!\param[in] srcLen Size of array (number of elements) to be compressed. -MemoryBuffer compress_memory_huffman(long int *const src, +MemoryBuffer compress_memory_huffman(long int const *const src, const std::size_t srcLen); //! Decompress an array compressed with `compress_memory_huffman`. @@ -33,7 +33,7 @@ MemoryBuffer compress_memory_huffman(long int *const src, //!\param[in] srcLen Size in bytes of the compressed array. //!\param[out] dst Decompressed array. //!\param[in] dstLen Size in bytes of the decompressed array. 
-void decompress_memory_huffman(unsigned char *const src, +void decompress_memory_huffman(unsigned char const *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen); diff --git a/src/compressors.cpp b/src/compressors.cpp index d31a24a06a..daa6bc8f2a 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -30,7 +30,7 @@ std::size_t hit_buffer_size(const std::size_t nbits) { } // namespace -void decompress_memory_huffman(unsigned char *const src, +void decompress_memory_huffman(unsigned char const *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen) { std::size_t const *const sizes = reinterpret_cast(src); @@ -114,7 +114,7 @@ gather_constituents(const std::vector &constituents) { } // namespace -MemoryBuffer compress_memory_huffman(long int *const src, +MemoryBuffer compress_memory_huffman(long int const *const src, const std::size_t srcLen) { const HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); diff --git a/tests/include/compressors_regression.hpp b/tests/include/compressors_regression.hpp index bfb2426a5d..07f632eec4 100644 --- a/tests/include/compressors_regression.hpp +++ b/tests/include/compressors_regression.hpp @@ -15,7 +15,7 @@ namespace regression { //! //!\param[in] src Array to be compressed. //!\param[in] srcLen Size of array (number of elements) to be compressed. -MemoryBuffer compress_memory_huffman(long int *const src, +MemoryBuffer compress_memory_huffman(long int const *const src, const std::size_t srcLen); //! Decompress an array compressed with `compress_memory_huffman`. @@ -24,7 +24,7 @@ MemoryBuffer compress_memory_huffman(long int *const src, //!\param[in] srcLen Size in bytes of the compressed array. //!\param[out] dst Decompressed array. //!\param[in] dstLen Size in bytes of the decompressed array. 
-void decompress_memory_huffman(unsigned char *const src, +void decompress_memory_huffman(unsigned char const *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen); diff --git a/tests/include/huffman_regression.hpp b/tests/include/huffman_regression.hpp index d67cd6b4ad..e6c00b092f 100644 --- a/tests/include/huffman_regression.hpp +++ b/tests/include/huffman_regression.hpp @@ -13,11 +13,13 @@ namespace regression { //! Encode quantized coefficients using a Huffman code. //! +//! The algorithm modifies the quantized data, so the input buffer is copied. +//! //!\param[in, out] quantized_data Input buffer (quantized coefficients). This //! buffer will be changed by the encoding process. //!\param[in] n Number of symbols (`long int` quantized coefficients) in the //! input buffer. -HuffmanEncodedStream huffman_encoding(long int *const quantized_data, +HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, const std::size_t n); //! Decode a stream encoded using a Huffman code. diff --git a/tests/src/compressors_regression.cpp b/tests/src/compressors_regression.cpp index dd98384898..c65bac9cd1 100644 --- a/tests/src/compressors_regression.cpp +++ b/tests/src/compressors_regression.cpp @@ -30,7 +30,7 @@ std::size_t hit_buffer_size(const std::size_t nbits) { // This code also makes endianness assumptions. 
-MemoryBuffer compress_memory_huffman(long int *const src, +MemoryBuffer compress_memory_huffman(long int const *const src, const std::size_t srcLen) { HuffmanEncodedStream encoded = mgard::regression::huffman_encoding(src, srcLen); @@ -95,7 +95,7 @@ MemoryBuffer compress_memory_huffman(long int *const src, return MemoryBuffer(buffer, bufferLen); } -void decompress_memory_huffman(unsigned char *const src, +void decompress_memory_huffman(unsigned char const *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen) { std::size_t const *const sizes = reinterpret_cast(src); diff --git a/tests/src/huffman_regression.cpp b/tests/src/huffman_regression.cpp index 5fbc4b74dd..c2c58bdc95 100644 --- a/tests/src/huffman_regression.cpp +++ b/tests/src/huffman_regression.cpp @@ -196,9 +196,12 @@ HuffmanCodec build_huffman_codec(long int *const quantized_data, return codec; } -HuffmanEncodedStream huffman_encoding(long int *const quantized_data, +HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, const std::size_t n) { - const HuffmanCodec codec = build_huffman_codec(quantized_data, n); + long int *const quantized_data_ = new long int[n]; + std::copy(quantized_data, quantized_data + n, quantized_data_); + + const HuffmanCodec codec = build_huffman_codec(quantized_data_, n); const std::size_t num_miss = codec.frequency_table[0]; assert(n >= num_miss); @@ -238,7 +241,7 @@ HuffmanEncodedStream huffman_encoding(long int *const quantized_data, std::size_t start_bit = 0; for (std::size_t i = 0; i < n; i++) { - const int q = quantized_data[i]; + const int q = quantized_data_[i]; unsigned int code; std::size_t len; @@ -276,6 +279,8 @@ HuffmanEncodedStream huffman_encoding(long int *const quantized_data, start_bit += len; } + delete[] quantized_data_; + return out; } diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp index 4d5a42048b..501f085039 100644 --- a/tests/src/test_compressors.cpp +++ 
b/tests/src/test_compressors.cpp @@ -31,17 +31,12 @@ void test_huffman_identity(std::default_random_engine &gen, delete[] decompressed; } -void test_huffman_compression_regression(long int *const src, +void test_huffman_compression_regression(long int const *const src, const std::size_t srcLen) { - long int *const src_ = new long int[srcLen]; - std::copy(src, src + srcLen, src_); - const mgard::MemoryBuffer out = mgard::regression::compress_memory_huffman(src, srcLen); const mgard::MemoryBuffer out_ = - mgard::compress_memory_huffman(src_, srcLen); - - delete[] src_; + mgard::compress_memory_huffman(src, srcLen); REQUIRE(out.size == out_.size); unsigned char const *const p = out.data.get(); @@ -49,17 +44,12 @@ void test_huffman_compression_regression(long int *const src, REQUIRE(std::equal(p, p + out.size, p_)); } -void test_huffman_decompression_regression(long int *const src, +void test_huffman_decompression_regression(long int const *const src, const std::size_t srcLen) { - long int *const src_ = new long int[srcLen]; - std::copy(src, src + srcLen, src_); - const mgard::MemoryBuffer compressed = mgard::regression::compress_memory_huffman(src, srcLen); const mgard::MemoryBuffer compressed_ = - mgard::regression::compress_memory_huffman(src_, srcLen); - - delete[] src_; + mgard::regression::compress_memory_huffman(src, srcLen); mgard::MemoryBuffer out(srcLen); mgard::MemoryBuffer out_(srcLen); diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index 95eeb1af0b..a035be7173 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -12,14 +12,12 @@ namespace { -void test_encoding_regression(long int *const quantized, const std::size_t N) { - long int *const quantized_ = new long int[N]; - std::copy(quantized, quantized + N, quantized_); - +void test_encoding_regression(long int const *const quantized, + const std::size_t N) { const mgard::HuffmanEncodedStream out = mgard::regression::huffman_encoding(quantized, N); const 
mgard::HuffmanEncodedStream out_ = - mgard::huffman_encoding(quantized_, N); + mgard::huffman_encoding(quantized, N); unsigned char const *const hit = out.hit.data.get(); REQUIRE(out_.nbits == out.nbits); @@ -36,20 +34,14 @@ void test_encoding_regression(long int *const quantized, const std::size_t N) { REQUIRE(out_.frequencies.size == nfrequencies); REQUIRE(std::equal(frequencies, frequencies + nfrequencies, out_.frequencies.data.get())); - - delete[] quantized_; } -void test_decoding_regression(long int *const quantized, const std::size_t N) { - long int *const quantized_ = new long int[N]; - std::copy(quantized, quantized + N, quantized_); - +void test_decoding_regression(long int const *const quantized, + const std::size_t N) { const mgard::HuffmanEncodedStream encoded = mgard::regression::huffman_encoding(quantized, N); const mgard::HuffmanEncodedStream encoded_ = - mgard::regression::huffman_encoding(quantized_, N); - - delete[] quantized_; + mgard::regression::huffman_encoding(quantized, N); const mgard::MemoryBuffer out = mgard::regression::huffman_decoding(encoded); From 4fd5399d47d747e3e260a553499f4d80f3a21d13 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 8 Jun 2022 11:40:09 -0400 Subject: [PATCH 33/58] Directly set `HuffmanCode` endpoints. --- include/huffman.hpp | 20 +++++++------ include/huffman.tpp | 73 +++++++++++++++++---------------------------- src/huffman.cpp | 31 ++++++++++++------- 3 files changed, 58 insertions(+), 66 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 0c8b0a5b93..5890d2fb88 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -18,7 +18,7 @@ namespace mgard { //! Huffman encoding and decoding functions. //! //!\deprecated -inline constexpr std::size_t nql = 32768 * 4; +inline constexpr std::size_t nql = 1 << 17; //! A stream compressed using a Huffman code. struct HuffmanEncodedStream { @@ -110,19 +110,24 @@ template class HuffmanCode { //! Constructor. //! 
- //!\param ncodewords Number of symbols that will be assigned codewords. + //!\param endpoints Smallest and largest symbols (inclusive) to receive + //! codewords. //!\param begin Beginning of input stream. //!\param end End of output stream. - HuffmanCode(const std::size_t ncodewords, Symbol const *const begin, - Symbol const *const end); + HuffmanCode(const std::pair &endpoints, + Symbol const *const begin, Symbol const *const end); //! Constructor. //! - //!\param ncodewords Number of symbols that will be assigned codewords. + //!\param endpoints Smallest and largest symbols (inclusive) to receive + //! codewords. //!\param pairs Index–frequency pairs for frequency table. - HuffmanCode(const std::size_t ncodewords, + HuffmanCode(const std::pair &endpoints, const std::vector> &pairs); + //! Smallest and largest symbols (inclusive) to receive codewords. + std::pair endpoints; + //! Number of symbols that will be assigned codewords. std::size_t ncodewords; @@ -164,9 +169,6 @@ template class HuffmanCode { Symbol decode(const Node &leaf, Symbol const *&missed) const; private: - //! Smallest and largest symbols (inclusive) to receive codewords. - std::pair endpoints; - //! Set the range of symbols that will be assigned codewords. //! //!\note This function depends on `ncodewords`. diff --git a/include/huffman.tpp b/include/huffman.tpp index 8d6a7cc9f5..6d4b906fc4 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -3,7 +3,6 @@ #include #include -#include #include namespace mgard { @@ -15,45 +14,6 @@ operator()(const typename HuffmanCode::Node &a, return a->count > b->count; } -template void HuffmanCode::set_endpoints() { - // Haven't carefully checked what the minimum acceptable value is. 
- if (not ncodewords) { - throw std::invalid_argument("`ncodewords` must be positive."); - } - const Symbol SYMBOL_MAX = std::numeric_limits::max(); - const Symbol SYMBOL_MIN = std::numeric_limits::min(); - - const std::size_t max_symbol_ = (ncodewords + 1) / 2 - 1; - const std::size_t opp_min_symbol_ = ncodewords / 2; - - // There is surely a better way of doing this. Lots of potential issues with - // directly comparing `opp_min_symbol_` and `-SYMBOL_MIN`. `-SYMBOL_MIN` - // can't necessarily be represented as a `Symbol`, for example. Trying to - // avoid overflows. - std::size_t a = opp_min_symbol_; - Symbol b = SYMBOL_MIN; - while (a) { - a /= 2; - b /= 2; - } - if (not b) { - // Only a "risk" because we haven't actually established that - // `opp_min_symbol_` is greater in magnitude than `SYMBOL_MIN`. - throw std::overflow_error( - "risk that minimum symbol cannot be represented in symbol type"); - } else if (opp_min_symbol_ > SYMBOL_MAX) { - throw std::overflow_error( - "opposite of minimum symbol canont be represented in symbol type"); - } else { - endpoints.first = -static_cast(opp_min_symbol_); - } - - // `opp_min_symbol_` is either equal to or one greater than `max_symbol_`, - // and we checked above that `opp_min_symbol <= SYMBOL_MAX`. So, we know - // that `max_symbol_ <= SYMBOL_MAX` here. - endpoints.second = max_symbol_; -} - template void HuffmanCode::create_code_creation_tree() { // We can't quite use a `ZippedRange` here, I think, because @@ -106,12 +66,33 @@ void HuffmanCode::populate_frequencies( } } +namespace { + +template +std::size_t +ncodewords_from_endpoints(const std::pair &endpoints) { + if (endpoints.first > endpoints.second) { + throw std::invalid_argument( + "maximum symbol must be greater than or equal to minimum symbol"); + } + // The endpoints are inclusive. + // Overflow possible in the subtraction. 
+ const std::size_t ncodewords = endpoints.second - endpoints.first + 1; + // Haven't carefully checked what the minimum acceptable value is. + if (not ncodewords) { + throw std::invalid_argument("`ncodewords` must be positive."); + } + return ncodewords; +} + +} // namespace + template -HuffmanCode::HuffmanCode(const std::size_t ncodewords, +HuffmanCode::HuffmanCode(const std::pair &endpoints, Symbol const *const begin, Symbol const *const end) - : ncodewords(ncodewords), frequencies(ncodewords), codewords(ncodewords) { - set_endpoints(); + : endpoints(endpoints), ncodewords(ncodewords_from_endpoints(endpoints)), + frequencies(ncodewords), codewords(ncodewords) { populate_frequencies(begin, end); create_code_creation_tree(); recursively_set_codewords(queue.top(), {}); @@ -119,10 +100,10 @@ HuffmanCode::HuffmanCode(const std::size_t ncodewords, template HuffmanCode::HuffmanCode( - const std::size_t ncodewords, + const std::pair &endpoints, const std::vector> &pairs) - : ncodewords(ncodewords), frequencies(ncodewords), codewords(ncodewords) { - set_endpoints(); + : endpoints(endpoints), ncodewords(ncodewords_from_endpoints(endpoints)), + frequencies(ncodewords), codewords(ncodewords) { populate_frequencies(pairs); create_code_creation_tree(); recursively_set_codewords(queue.top(), {}); diff --git a/src/huffman.cpp b/src/huffman.cpp index fd5ecd6a33..bc0a4c6a46 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -91,13 +91,21 @@ void check_type_sizes() { } // namespace +namespace { + +const std::pair nql_endpoints{ + -static_cast((nql - 1) / 2), nql / 2 - 1}; +} + HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, const std::size_t n) { check_type_sizes(); - const std::size_t ncodewords = nql - 1; - const HuffmanCode code(ncodewords, quantized_data, - quantized_data + n); + using Symbol = long int; + using MissedSymbol = int; + + const HuffmanCode code(nql_endpoints, quantized_data, + quantized_data + n); std::vector lengths; for (const 
HuffmanCodeword &codeword : code.codewords) { @@ -114,10 +122,11 @@ HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, "`nbytes` not bumped up to nearest multiple of `unsigned int` size"); } - const std::size_t nnz = ncodewords - std::count(code.frequencies.begin(), - code.frequencies.end(), 0); + const std::size_t nnz = + code.ncodewords - + std::count(code.frequencies.begin(), code.frequencies.end(), 0); - HuffmanEncodedStream out(nbits, nbytes, code.nmissed() * sizeof(int), + HuffmanEncodedStream out(nbits, nbytes, code.nmissed() * sizeof(MissedSymbol), 2 * nnz * sizeof(std::size_t)); // Write frequency table. @@ -125,7 +134,7 @@ HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, std::size_t *p = reinterpret_cast(out.frequencies.data.get()); const std::vector &frequencies = code.frequencies; - for (std::size_t i = 0; i < ncodewords; ++i) { + for (std::size_t i = 0; i < code.ncodewords; ++i) { const std::size_t frequency = frequencies.at(i); if (frequency) { *p++ = i; @@ -141,10 +150,11 @@ HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, } unsigned char *hit = buffer; - int *missed = reinterpret_cast(out.missed.data.get()); + MissedSymbol *missed = + reinterpret_cast(out.missed.data.get()); unsigned char offset = 0; - for (const long int q : PseudoArray(quantized_data, n)) { + for (const Symbol q : PseudoArray(quantized_data, n)) { if (code.out_of_range(q)) { // Remember that `missed` is an `int` rather than a `long int`. 
*missed++ = q + nql / 2; @@ -213,8 +223,7 @@ MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { } } - const std::size_t ncodewords = nql - 1; - HuffmanCode code(ncodewords, pairs); + HuffmanCode code(nql_endpoints, pairs); MemoryBuffer out(nquantized); Symbol *q = out.data.get(); From 9f112da1950e673b89e53debe40d69b14ae1c40a Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 9 Jun 2022 12:06:40 -0400 Subject: [PATCH 34/58] Fix calculation of `HuffmanCode::ncodewords`. --- include/huffman.hpp | 3 ++- include/huffman.tpp | 15 ++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 5890d2fb88..a3b89fc476 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -128,7 +128,8 @@ template class HuffmanCode { //! Smallest and largest symbols (inclusive) to receive codewords. std::pair endpoints; - //! Number of symbols that will be assigned codewords. + //! Number of symbols that will be assigned codewords (including one for the + //! 'missed' symbol). std::size_t ncodewords; //! Frequencies of the symbols in the input stream. diff --git a/include/huffman.tpp b/include/huffman.tpp index 6d4b906fc4..40cd9e7ff7 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -75,13 +75,14 @@ ncodewords_from_endpoints(const std::pair &endpoints) { throw std::invalid_argument( "maximum symbol must be greater than or equal to minimum symbol"); } - // The endpoints are inclusive. - // Overflow possible in the subtraction. - const std::size_t ncodewords = endpoints.second - endpoints.first + 1; - // Haven't carefully checked what the minimum acceptable value is. - if (not ncodewords) { - throw std::invalid_argument("`ncodewords` must be positive."); - } + // One for the 'missed' symbol, and the endpoints are inclusive. + // Overflow is possible in the subtraction `endpoints.second - + // endpoints.first` (suppose `Symbol` is `char` and `endpoints` is `{CHAR_MIN, + // CHAR_MAX}`. 
Casting to `std::int64_t` should avoid the problem in all + // practical cases. + const std::size_t ncodewords = 1 + + static_cast(endpoints.second) - + static_cast(endpoints.first) + 1; return ncodewords; } From 6d619800504221188282f341370f393709e58787 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 9 Jun 2022 14:03:45 -0400 Subject: [PATCH 35/58] Generalize function to parse header from buffer. --- include/format.hpp | 17 ++++++++++------- include/format.tpp | 22 ++++++++++++++++++++++ src/format.cpp | 24 +----------------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/include/format.hpp b/include/format.hpp index e7821e64e6..0a817e78f6 100644 --- a/include/format.hpp +++ b/include/format.hpp @@ -165,16 +165,19 @@ pb::Header read_metadata(BufferWindow &window); //!\param header Header of the self-describing buffer. void write_metadata(std::ostream &ostream, const pb::Header &header); -//! Parse the header of a self-describing buffer. +template +//! Parse a message from a buffer window. //! //! The buffer pointer will be advanced past the header. //! -//!\param window Window into the self-describing buffer. The current position -//! should be the start of the header. -//!\param header_size Size in bytes of the header. -//!\return Header of the self-describing buffer. -pb::Header read_header(BufferWindow &window, - const std::uint_least64_t header_size); +//! This function was originally written to parse the header from a +//! self-describing buffer. +// +//!\param window Buffer window containing the serialized message. The current +//! position should be the start of the message. +//!\param nmessage Size in bytes of the message. +//!\return Parsed message. +T read_message(BufferWindow &window, const std::uint_least64_t nmessage); //! Check that a dataset was compressed with a compatible version of MGARD. //! 
diff --git a/include/format.tpp b/include/format.tpp index e223235a41..14b33bb77b 100644 --- a/include/format.tpp +++ b/include/format.tpp @@ -61,4 +61,26 @@ template bool big_endian() { return not*reinterpret_cast(&n); } +template +T read_message(BufferWindow &window, const std::uint_least64_t nmessage) { + // The `CodedInputStream` constructor takes an `int`. + if (nmessage > std::numeric_limits::max()) { + throw std::runtime_error("message is too large (size would overflow)"); + } + // Check that the read will stay in the buffer. + unsigned char const *const next = window.next(nmessage); + T message; + google::protobuf::io::CodedInputStream stream( + static_cast(window.current), nmessage); + if (not message.ParseFromCodedStream(&stream)) { + throw std::runtime_error( + "message parsing encountered read or format error"); + } + if (not stream.ConsumedEntireMessage()) { + throw std::runtime_error("part of message left unparsed"); + } + window.current = next; + return message; +} + } // namespace mgard diff --git a/src/format.cpp b/src/format.cpp index f9c4c62aa9..0464e4b791 100644 --- a/src/format.cpp +++ b/src/format.cpp @@ -204,7 +204,7 @@ pb::Header read_metadata(BufferWindow &window) { const uint_least64_t header_size = read_header_size(window); const uint_least32_t header_crc32 = read_header_crc32(window); check_header_crc32(window, header_size, header_crc32); - return read_header(window, header_size); + return read_message(window, header_size); } namespace { @@ -232,28 +232,6 @@ void write_metadata(std::ostream &ostream, const pb::Header &header) { delete[] header_bytes; } -pb::Header read_header(BufferWindow &window, - const std::uint_least64_t header_size) { - // The `CodedInputStream` constructor takes an `int`. - if (header_size > std::numeric_limits::max()) { - throw std::runtime_error("header is too large (size would overflow)"); - } - // Check that the read will stay in the buffer. 
- unsigned char const *const next = window.next(header_size); - mgard::pb::Header header; - google::protobuf::io::CodedInputStream stream( - static_cast(window.current), - header_size); - if (not header.ParseFromCodedStream(&stream)) { - throw std::runtime_error("header parsing encountered read or format error"); - } - if (not stream.ConsumedEntireMessage()) { - throw std::runtime_error("part of header left unparsed"); - } - window.current = next; - return header; -} - void check_mgard_version(const pb::Header &header) { const pb::VersionNumber &mgard_version = header.mgard_version(); if (mgard_version.major_() > MGARD_VERSION_MAJOR) { From 501183c7fb3eb6530e025423a6d9d6ea33d2b397 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 9 Jun 2022 14:13:12 -0400 Subject: [PATCH 36/58] Add Huffman encoding with protocol buffer header. --- include/huffman.hpp | 20 +++- include/huffman.tpp | 221 +++++++++++++++++++++++++++++++++++- src/huffman.cpp | 20 +++- src/mgard.proto | 71 +++++++++++- tests/src/test_compress.cpp | 2 +- tests/src/test_huffman.cpp | 66 ++++++++++- 6 files changed, 385 insertions(+), 15 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index a3b89fc476..ee8f662e2d 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -158,16 +158,18 @@ template class HuffmanCode { //! Huffman code creation tree. std::priority_queue, HeldCountGreater> queue; + // TODO: Just indicate in return value whether symbol was missed. + //! Decode a codeword (identified by associated leaf) to a symbol. //! //!\pre `leaf` must be a leaf (rather than an interior node) of the code - //! creation tree. + //! creation tree. `It::value_type` must be convertible to `Symbol`. //! //!\param leaf Leaf (associated to a codeword) to decode. //!\param missed Pointer to next out-of-range symbol. If `leaf` is associated //! to the out-of-range codeword, this pointer will be dereferenced and //! incremented. 
- Symbol decode(const Node &leaf, Symbol const *&missed) const; + template Symbol decode(const Node &leaf, It &missed) const; private: //! Set the range of symbols that will be assigned codewords. @@ -225,6 +227,20 @@ HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, //!\param[in] encoded Input buffer (Huffman-encoded stream). MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded); +//! Encode quantized coefficients using a Huffman code. +//! +//!\param begin Input buffer (quantized coefficients). +//!\param n Number of symbols in the input buffer. +template +MemoryBuffer huffman_encode(Symbol const *const begin, + const std::size_t n); + +//! Decode a stream encoded using a Huffman code. +//! +//!\param encoded Input buffer (Huffman-encoded stream). +template +MemoryBuffer huffman_decode(const MemoryBuffer &buffer); + } // namespace mgard #include "huffman.tpp" diff --git a/include/huffman.tpp b/include/huffman.tpp index 40cd9e7ff7..97ead4c4c9 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -1,12 +1,30 @@ #include "utilities.hpp" #include +#include #include +#include +#include +#include #include +#include "format.hpp" + +#include "proto/mgard.pb.h" + namespace mgard { +// Aliases for compound message field types. +namespace { + +using Endpoints = google::protobuf::RepeatedField; +using Missed = google::protobuf::RepeatedField; +using Frequencies = + google::protobuf::Map; + +} // namespace + template bool HuffmanCode::HeldCountGreater:: operator()(const typename HuffmanCode::Node &a, @@ -49,9 +67,10 @@ void HuffmanCode::populate_frequencies(Symbol const *const begin, } template +template Symbol HuffmanCode::decode(const typename HuffmanCode::Node &leaf, - Symbol const *&missed) const { + It &missed) const { const std::ptrdiff_t offset = leaf->codeword - codewords.data(); // If `offset == 0`, this is the leaf corresponding to out-of-range symbols. 
assert(offset >= 0); @@ -138,4 +157,204 @@ void HuffmanCode::recursively_set_codewords( } } +namespace { + +//! Generate the default symbol endpoints for a Huffman encoder. +template std::pair endpoints(); + +template std::pair extreme_endpoints() { + return {std::numeric_limits::min(), + std::numeric_limits::max()}; +} + +template std::pair capped_endpoints() { + return {-static_cast(1 << 17), static_cast(1 << 17) - 1}; +} + +template <> std::pair endpoints() { + return extreme_endpoints(); +} + +template <> std::pair endpoints() { + return extreme_endpoints(); +} + +template <> std::pair endpoints() { + return capped_endpoints(); +} + +template <> std::pair endpoints() { + return capped_endpoints(); +} + +} // namespace + +template +MemoryBuffer huffman_encode(Symbol const *const begin, + const std::size_t n) { + const HuffmanCode code(endpoints(), begin, begin + n); + + std::vector lengths; + for (const HuffmanCodeword &codeword : code.codewords) { + lengths.push_back(codeword.length); + } + const std::size_t nbits = + std::inner_product(code.frequencies.begin(), code.frequencies.end(), + lengths.begin(), static_cast(0)); + const std::size_t nbytes = (nbits + CHAR_BIT - 1) / CHAR_BIT; + + pb::HuffmanHeader header; + header.set_index_mapping(pb::HuffmanHeader::INCLUSIVE_RANGE); + header.set_codeword_mapping(pb::HuffmanHeader::INDEX_FREQUENCY_PAIRS); + header.set_missed_encoding(pb::HuffmanHeader::LITERAL); + header.set_hit_encoding(pb::HuffmanHeader::RUN_TOGETHER); + + header.add_endpoints(code.endpoints.first); + header.add_endpoints(code.endpoints.second); + header.set_nbits(nbits); + + Frequencies &frequencies = *header.mutable_frequencies(); + { + std::size_t i = 0; + for (const std::size_t frequency : code.frequencies) { + if (frequency) { + frequencies.insert({i, frequency}); + } + ++i; + } + } + + Missed &missed_ = *header.mutable_missed(); + missed_.Resize(code.nmissed(), 0); + Missed::iterator missed = missed_.begin(); + + // Zero-initialize the bytes. 
+ unsigned char *const hit_ = new unsigned char[nbytes](); + unsigned char *hit = hit_; + + unsigned char offset = 0; + for (const Symbol q : PseudoArray(begin, n)) { + if (code.out_of_range(q)) { + *missed++ = q; + } + + const HuffmanCodeword codeword = code.codewords.at(code.index(q)); + std::size_t NREMAINING = codeword.length; + for (unsigned char byte : codeword.bytes) { + // Number of bits of `byte` left to write. + unsigned char nremaining = + std::min(static_cast(CHAR_BIT), NREMAINING); + // Premature, but this will hold when we're done with `byte`. + NREMAINING -= nremaining; + + while (nremaining) { + *hit |= byte >> offset; + // Number of bits of `byte` just written (not cumulative). + const unsigned char nwritten = std::min( + nremaining, static_cast( + static_cast(CHAR_BIT) - offset)); + offset += nwritten; + hit += offset / CHAR_BIT; + offset %= CHAR_BIT; + nremaining -= nwritten; + byte <<= nwritten; + } + } + } + + const std::uint_least64_t nheader = header.ByteSize(); + MemoryBuffer out(HEADER_SIZE_SIZE + nheader + nbytes); + { + unsigned char *p = out.data.get(); + const std::array nheader_ = + serialize_header_size(nheader); + std::copy(nheader_.begin(), nheader_.end(), p); + p += HEADER_SIZE_SIZE; + + header.SerializeToArray(p, nheader); + p += nheader; + + std::copy(hit_, hit_ + nbytes, p); + p += nbytes; + } + + delete[] hit_; + + return out; +} + +template +MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { + BufferWindow window(buffer.data.get(), buffer.size); + const std::uint_least64_t nheader = read_header_size(window); + pb::HuffmanHeader header = read_message(window, nheader); + + if (header.index_mapping() != pb::HuffmanHeader::INCLUSIVE_RANGE) { + throw std::runtime_error("unrecognized Huffman index mapping"); + } + const Endpoints &endpoints_ = header.endpoints(); + if (endpoints_.size() != 2) { + throw std::runtime_error("received an unexpected number of endpoints"); + } + const std::pair endpoints(endpoints_.Get(0), + 
endpoints_.Get(1)); + + if (header.codeword_mapping() != pb::HuffmanHeader::INDEX_FREQUENCY_PAIRS) { + throw std::runtime_error("unrecognized Huffman codeword mapping"); + } + const Frequencies &frequencies_ = header.frequencies(); + // TODO: Change `HuffmanCode` constructor so it can take a pair of iterators + // dereferencing to (something convertible to) + // `std::pair`s directly. + const std::vector> pairs( + frequencies_.begin(), frequencies_.end()); + + if (header.missed_encoding() != pb::HuffmanHeader::LITERAL) { + throw std::runtime_error("unrecognized Huffman missed buffer encoding"); + } + const Missed &missed_ = header.missed(); + Missed::const_iterator missed = missed_.cbegin(); + + if (header.hit_encoding() != pb::HuffmanHeader::RUN_TOGETHER) { + throw std::runtime_error("unrecognized Huffman hit buffer encoding"); + } + + const std::size_t nbits = header.nbits(); + const std::size_t nbytes = (nbits + CHAR_BIT - 1) / CHAR_BIT; + if (window.current + nbytes != window.end) { + throw std::runtime_error("number of bits in hit buffer inconsistent with " + "number of bytes in hit buffer"); + } + + const HuffmanCode code(endpoints, pairs); + // TODO: Maybe add a member function for this. + const std::size_t nout = + std::accumulate(code.frequencies.begin(), code.frequencies.end(), + static_cast(0)); + MemoryBuffer out(nout); + Symbol *q = out.data.get(); + + const Bits bits(window.current, window.current + nbits / CHAR_BIT, + nbits % CHAR_BIT); + std::size_t nbits_read = 0; + const typename HuffmanCode::Node root = code.queue.top(); + assert(root); + Bits::iterator b = bits.begin(); + for (std::size_t i = 0; i < nout; ++i) { + typename HuffmanCode::Node node; + for (node = root; node->left; + node = *b++ ? node->right : node->left, ++nbits_read) + ; + // TODO: Make sure `HuffmanCode::decode` can properly take `missed` (not + // relying on `google::protobuf::uint64` being the same as `std::size_t` or + // anything). 
+ const Symbol decoded = code.decode(node, missed); + *q++ = decoded; + } + assert(nbits_read == nbits); + assert(missed == missed_.cend()); + + return out; +} + } // namespace mgard diff --git a/src/huffman.cpp b/src/huffman.cpp index bc0a4c6a46..2e16de3e96 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -190,9 +190,21 @@ HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, namespace { -long int decode(const HuffmanCode &code, - const typename HuffmanCode::Node &leaf, - long int const *&missed) { +//! Decode a codeword (identified by associated leaf) to a symbol and shift. +//! +//!\pre `leaf` must be a leaf (rather than an interior node) of the code +//! creation tree. +//! +//!\deprecated +//! +//!\param code Code containing the code creation tree. +//!\param leaf Leaf (associated to a codeword) to decode. +//!\param missed Pointer to next out-of-range symbol. If `leaf` is associated +//! to the out-of-range codeword, this pointer will be dereferenced and +//! incremented. +long int decode_and_shift(const HuffmanCode &code, + const typename HuffmanCode::Node &leaf, + long int const *&missed) { long int const *const start = missed; long int decoded = code.decode(leaf, missed); if (missed != start) { @@ -257,7 +269,7 @@ MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { for (node = root; node->left; node = *b++ ? 
node->right : node->left, ++nbits) ; - *q++ = decode(code, node, p_missed); + *q++ = decode_and_shift(code, node, p_missed); } assert(nbits == encoded.nbits); assert(sizeof(MissedSymbol) * (p_missed - missed) == encoded.missed.size); diff --git a/src/mgard.proto b/src/mgard.proto index f5c6fd6fa9..a96fd67c4f 100644 --- a/src/mgard.proto +++ b/src/mgard.proto @@ -14,10 +14,14 @@ message CartesianGridTopology { repeated uint64 shape = 2; } -message ExplicitCubeGeometry { repeated double coordinates = 2; } +message ExplicitCubeGeometry { + repeated double coordinates = 2; +} message Domain { - enum Topology { CARTESIAN_GRID = 0; } + enum Topology { + CARTESIAN_GRID = 0; + } enum Geometry { UNIT_CUBE = 0; EXPLICIT_CUBE = 1; @@ -78,7 +82,9 @@ message DomainDecomposition { } message FunctionDecomposition { - enum Transform { MULTILEVEL_COEFFICIENTS = 0; } + enum Transform { + MULTILEVEL_COEFFICIENTS = 0; + } enum Hierarchy { POWER_OF_TWO_PLUS_ONE = 0; MULTIDIMENSION_WITH_GHOST_NODES = 1; @@ -92,7 +98,9 @@ message FunctionDecomposition { } message Quantization { - enum Method { COEFFICIENTWISE_LINEAR = 0; } + enum Method { + COEFFICIENTWISE_LINEAR = 0; + } enum BinWidths { PER_COEFFICIENT = 0; PER_LEVEL = 1; @@ -123,13 +131,64 @@ message Encoding { X_HUFFMAN_LZ4 = 4; X_HUFFMAN_ZSTD = 5; } + enum HuffmanSerialization { + // Original method, with 'raw' buffer serialization. + DEPRECATED = 0; + // Symbol range, frequency table, missed table, and hit buffer. + RFMH = 1; + } Preprocessor preprocessor = 1; Compressor compressor = 2; - // Only relevant when `compressor == X_HUFFMAN` or `lossless_compressor == - // X_HUFFMAN_LZ4` or `compressor == X_HUFFMAN_ZSTD` + // Only relevant when `compressor == X_HUFFMAN` or `compressor == + // X_HUFFMAN_LZ4` or `compressor == X_HUFFMAN_ZSTD`. uint64 huffman_dictionary_size = 3; uint64 huffman_block_size = 4; + + // Only relevant when `compressor == CPU_HUFFMAN_ZLIB` or + // `compressor == CPU_HUFFMAN_ZSTD`. 
+ HuffmanSerialization serialization = 5; + +} + +message HuffmanHeader { + enum IndexMapping { + // Codewords are (potentially) assigned to the symbols `{min, …, max}`. + // Index `0` is reserved for missed symbols. Then `min` is assigned + // index `1`, `min + 1` is assigned index `2`, and so on. + INCLUSIVE_RANGE = 0; + } + enum CodewordMapping { + // A frequency table is stored as a sequence of index–frequency pairs. + // This table is used to construct a Huffman code creation tree. + INDEX_FREQUENCY_PAIRS = 0; + } + enum MissedEncoding { + // The missed symbols (rather than their indices, for example) are encoded. + LITERAL = 0; + } + enum HitEncoding { + // The codeword bits are run together into a single byte array. + RUN_TOGETHER = 0; + } + + // How each (eligible) symbol is assigned an index. + IndexMapping index_mapping = 1; + // How each (encountered) index is assigned a codeword. + CodewordMapping codeword_mapping = 2; + // How the missed buffer is encoded. + MissedEncoding missed_encoding = 3; + // How the hit (codeword) buffer is encoded. + HitEncoding hit_encoding = 4; + + // Minimum and maximum symbols eligible for codewords. + repeated sint64 endpoints = 5; + // Index–frequency pairs for frequency table. + map frequencies = 6; + // Encountered symbols that were not assigned codewords. + repeated sint64 missed = 7; + // Size of the hit buffer in bits. 
+ uint64 nbits = 8; } message Device { diff --git a/tests/src/test_compress.cpp b/tests/src/test_compress.cpp index ebb41eecfd..b59c05eff0 100644 --- a/tests/src/test_compress.cpp +++ b/tests/src/test_compress.cpp @@ -398,7 +398,7 @@ void test_self_describing_decompression( TEMPLATE_TEST_CASE("decompressing self-describing buffer", "[compress]", float, double) { - std::default_random_engine gen(32094); + std::default_random_engine gen(361656); const std::vector smoothness_parameters = { -1.5, -0.5, 0.0, 0.5, 1.5, std::numeric_limits::infinity()}; const std::vector tolerances = {1, 0.1, 0.01, 0.001}; diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index a035be7173..0921f4538b 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -1,6 +1,8 @@ +#include "catch2/catch_template_test_macros.hpp" #include "catch2/catch_test_macros.hpp" #include +#include #include #include @@ -54,6 +56,15 @@ void test_decoding_regression(long int const *const quantized, REQUIRE(std::equal(p, p + out.size, p_)); } +template void test_inversion(T const *const q, std::size_t N) { + const mgard::MemoryBuffer compressed = + mgard::huffman_encode(q, N); + const mgard::MemoryBuffer decompressed = + mgard::huffman_decode(compressed); + REQUIRE(N == decompressed.size); + REQUIRE(std::equal(q, q + N, decompressed.data.get())); +} + void test_encoding_regression_constant(const std::size_t N, const long int q) { long int *const quantized = new long int[N]; std::fill(quantized, quantized + N, q); @@ -104,6 +115,33 @@ void test_decoding_regression_random(const std::size_t N, const long int a, delete[] quantized; } +template +void test_inversion_constant(const std::size_t N, const T q) { + T *const quantized = new T[N]; + std::fill(quantized, quantized + N, q); + test_inversion(quantized, N); + delete[] quantized; +} + +template +void test_inversion_periodic(const std::size_t N, const T q, + const std::size_t period) { + T *const quantized = new T[N]; + 
std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); + test_inversion(quantized, N); + delete[] quantized; +} + +template +void test_inversion_random(const std::size_t N, const T a, const T b, + std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + T *const quantized = new T[N]; + std::generate(quantized, quantized + N, [&] { return dis(gen); }); + test_inversion(quantized, N); + delete[] quantized; +} + } // namespace TEST_CASE("encoding regression", "[huffman] [regression]") { @@ -120,7 +158,7 @@ TEST_CASE("encoding regression", "[huffman] [regression]") { } SECTION("random data") { - std::default_random_engine gen(131051); + std::default_random_engine gen(726847); test_encoding_regression_random(10, 0, 1, gen); test_encoding_regression_random(100, -15, -5, gen); test_encoding_regression_random(1000, std::numeric_limits::min(), @@ -151,3 +189,29 @@ TEST_CASE("decoding regression", "[huffman] [regression]") { test_decoding_regression_random(10000, -100, 100, gen); } } + +TEMPLATE_TEST_CASE("Huffman inversion", "[huffman]", std::int8_t, std::int16_t, + std::int32_t, std::int64_t) { + std::default_random_engine gen_(454114); + std::uniform_int_distribution dis; + SECTION("constant data") { + test_inversion_constant(10, dis(gen_)); + test_inversion_constant(100, -dis(gen_)); + test_inversion_constant(1000, dis(gen_)); + } + + SECTION("periodic data") { + test_inversion_periodic(10, -dis(gen_), 11); + test_inversion_periodic(100, dis(gen_), 10); + test_inversion_periodic(1000, -dis(gen_), 9); + } + + SECTION("random data") { + std::default_random_engine gen(950142); + test_inversion_random(10, 0, 1, gen); + test_inversion_random(100, -12, 11, gen); + test_inversion_random(1000, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_inversion_random(10000, -100, 100, gen); + } +} From fec4c94c85ec535ff6e1d0b2ff0d9ba5ed0f2ed2 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 9 Jun 2022 16:56:05 -0400 
Subject: [PATCH 37/58] Add static data member for default symbol range. --- include/huffman.hpp | 14 ++++++++---- include/huffman.tpp | 54 ++++++++++++++++++--------------------------- src/huffman.cpp | 12 ++++++++++ 3 files changed, 43 insertions(+), 37 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index ee8f662e2d..7c78b07d67 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -117,6 +117,14 @@ template class HuffmanCode { HuffmanCode(const std::pair &endpoints, Symbol const *const begin, Symbol const *const end); + //! Constructor. + //! + //! The endpoints will be set to `default_endpoints`. + //! + //!\param begin Beginning of input stream. + //!\param end End of output stream. + HuffmanCode(Symbol const *const begin, Symbol const *const end); + //! Constructor. //! //!\param endpoints Smallest and largest symbols (inclusive) to receive @@ -172,10 +180,8 @@ template class HuffmanCode { template Symbol decode(const Node &leaf, It &missed) const; private: - //! Set the range of symbols that will be assigned codewords. - //! - //!\note This function depends on `ncodewords`. - void set_endpoints(); + //! Default symbol range. + const static std::pair default_endpoints; //! Populate the frequency table using a stream of symbols. //! diff --git a/include/huffman.tpp b/include/huffman.tpp index 97ead4c4c9..b4718c814e 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -57,6 +57,21 @@ void HuffmanCode::create_code_creation_tree() { } } +// This default will be used for `std::int{8,16}_t` We'll specialize the default +// for `std::int{32,64}_t` in the implementation file. +template +const std::pair HuffmanCode::default_endpoints = { + std::numeric_limits::min(), std::numeric_limits::max()}; + +// I believe these are called 'template specialization declarations.' 
+template <> +const std::pair + HuffmanCode::default_endpoints; + +template <> +const std::pair + HuffmanCode::default_endpoints; + template void HuffmanCode::populate_frequencies(Symbol const *const begin, Symbol const *const end) { @@ -118,6 +133,11 @@ HuffmanCode::HuffmanCode(const std::pair &endpoints, recursively_set_codewords(queue.top(), {}); } +template +HuffmanCode::HuffmanCode(Symbol const *const begin, + Symbol const *const end) + : HuffmanCode(default_endpoints, begin, end) {} + template HuffmanCode::HuffmanCode( const std::pair &endpoints, @@ -157,42 +177,10 @@ void HuffmanCode::recursively_set_codewords( } } -namespace { - -//! Generate the default symbol endpoints for a Huffman encoder. -template std::pair endpoints(); - -template std::pair extreme_endpoints() { - return {std::numeric_limits::min(), - std::numeric_limits::max()}; -} - -template std::pair capped_endpoints() { - return {-static_cast(1 << 17), static_cast(1 << 17) - 1}; -} - -template <> std::pair endpoints() { - return extreme_endpoints(); -} - -template <> std::pair endpoints() { - return extreme_endpoints(); -} - -template <> std::pair endpoints() { - return capped_endpoints(); -} - -template <> std::pair endpoints() { - return capped_endpoints(); -} - -} // namespace - template MemoryBuffer huffman_encode(Symbol const *const begin, const std::size_t n) { - const HuffmanCode code(endpoints(), begin, begin + n); + const HuffmanCode code(begin, begin + n); std::vector lengths; for (const HuffmanCodeword &codeword : code.codewords) { diff --git a/src/huffman.cpp b/src/huffman.cpp index 2e16de3e96..8331630657 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -280,4 +280,16 @@ MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { return out; } +template <> +const std::pair + HuffmanCode::default_endpoints = { + -static_cast(1 << 17), + static_cast(1 << 17) - 1}; + +template <> +const std::pair + HuffmanCode::default_endpoints = { + -static_cast(1 << 17), + static_cast(1 
<< 17) - 1}; + } // namespace mgard From ebf3c34adb80b90edb98e43a7eb9e6d48736bc20 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 9 Jun 2022 17:13:48 -0400 Subject: [PATCH 38/58] Separate codeword decoding, missed buffer lookup. --- include/huffman.hpp | 11 ++++------- include/huffman.tpp | 16 ++++++---------- src/huffman.cpp | 8 ++------ 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 7c78b07d67..09326c996c 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -166,18 +166,15 @@ template class HuffmanCode { //! Huffman code creation tree. std::priority_queue, HeldCountGreater> queue; - // TODO: Just indicate in return value whether symbol was missed. - //! Decode a codeword (identified by associated leaf) to a symbol. //! //!\pre `leaf` must be a leaf (rather than an interior node) of the code - //! creation tree. `It::value_type` must be convertible to `Symbol`. + //! creation tree. //! //!\param leaf Leaf (associated to a codeword) to decode. - //!\param missed Pointer to next out-of-range symbol. If `leaf` is associated - //! to the out-of-range codeword, this pointer will be dereferenced and - //! incremented. - template Symbol decode(const Node &leaf, It &missed) const; + //!\return A boolean indicating whether the original symbol was 'hit' and the + //! symbol itself (junk if the original symbol was 'missed'). + std::pair decode(const Node &leaf) const; private: //! Default symbol range. 
diff --git a/include/huffman.tpp b/include/huffman.tpp index b4718c814e..fdaaf4473c 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -82,14 +82,13 @@ void HuffmanCode::populate_frequencies(Symbol const *const begin, } template -template -Symbol -HuffmanCode::decode(const typename HuffmanCode::Node &leaf, - It &missed) const { +std::pair HuffmanCode::decode( + const typename HuffmanCode::Node &leaf) const { const std::ptrdiff_t offset = leaf->codeword - codewords.data(); // If `offset == 0`, this is the leaf corresponding to out-of-range symbols. assert(offset >= 0); - return offset ? endpoints.first + (offset - 1) : *missed++; + return offset ? std::pair(true, endpoints.first + (offset - 1)) + : std::pair(false, {}); } template @@ -333,11 +332,8 @@ MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { for (node = root; node->left; node = *b++ ? node->right : node->left, ++nbits_read) ; - // TODO: Make sure `HuffmanCode::decode` can properly take `missed` (not - // relying on `google::protobuf::uint64` being the same as `std::size_t` or - // anything). - const Symbol decoded = code.decode(node, missed); - *q++ = decoded; + const std::pair decoded = code.decode(node); + *q++ = decoded.first ? decoded.second : *missed++; } assert(nbits_read == nbits); assert(missed == missed_.cend()); diff --git a/src/huffman.cpp b/src/huffman.cpp index 8331630657..2ccd49b8cf 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -205,12 +205,8 @@ namespace { long int decode_and_shift(const HuffmanCode &code, const typename HuffmanCode::Node &leaf, long int const *&missed) { - long int const *const start = missed; - long int decoded = code.decode(leaf, missed); - if (missed != start) { - decoded -= nql / 2; - } - return decoded; + const std::pair pair = code.decode(leaf); + return pair.first ? 
pair.second : *missed++ - nql / 2; } } // namespace From 1c44399a9fea3e42ee7443e86468841fc837499c Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 9 Jun 2022 17:36:19 -0400 Subject: [PATCH 39/58] =?UTF-8?q?Pass=20index=E2=80=93frequency=20pair=20r?= =?UTF-8?q?ange=20as=20iterator=20pair.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/huffman.hpp | 24 +++++++++++++----------- include/huffman.tpp | 22 +++++++++------------- src/huffman.cpp | 2 +- 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 09326c996c..b23d19b933 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -127,11 +127,16 @@ template class HuffmanCode { //! Constructor. //! + //! `It::value_type` should be (convertible to) + //! `std::pair`. + //! //!\param endpoints Smallest and largest symbols (inclusive) to receive //! codewords. - //!\param pairs Index–frequency pairs for frequency table. - HuffmanCode(const std::pair &endpoints, - const std::vector> &pairs); + //!\param begin Beginning of index–frequency pair range for frequency table. + //!\param end Beginning of index–frequency pair range for frequency table. + template + HuffmanCode(const std::pair &endpoints, const It begin, + const It end); //! Smallest and largest symbols (inclusive) to receive codewords. std::pair endpoints; @@ -191,13 +196,10 @@ template class HuffmanCode { //! Populate the frequency table from a collection of index–frequency pairs. //! - //!\pre `frequencies` should have length `ncodewords` and all entries should - //! be zero. - //! - //!\param pairs Beginning of stream of symbols. - //!\param end End of stream of symbols. - void populate_frequencies( - const std::vector> &pairs); + //!\param begin Beginning of index–frequency pair range. + //!\param end End of index–frequency pair range. + template + void populate_frequencies(const It begin, const It end); //! 
Create the Huffman code creation tree. //! @@ -240,7 +242,7 @@ MemoryBuffer huffman_encode(Symbol const *const begin, //! Decode a stream encoded using a Huffman code. //! -//!\param encoded Input buffer (Huffman-encoded stream). +//!\param buffer Input buffer (Huffman-encoded stream). template MemoryBuffer huffman_decode(const MemoryBuffer &buffer); diff --git a/include/huffman.tpp b/include/huffman.tpp index fdaaf4473c..904c23e24f 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -92,9 +92,9 @@ std::pair HuffmanCode::decode( } template -void HuffmanCode::populate_frequencies( - const std::vector> &pairs) { - for (auto [index, frequency] : pairs) { +template +void HuffmanCode::populate_frequencies(const It begin, const It end) { + for (auto [index, frequency] : RangeSlice{.begin_ = begin, .end_ = end}) { frequencies.at(index) = frequency; } } @@ -138,12 +138,12 @@ HuffmanCode::HuffmanCode(Symbol const *const begin, : HuffmanCode(default_endpoints, begin, end) {} template -HuffmanCode::HuffmanCode( - const std::pair &endpoints, - const std::vector> &pairs) +template +HuffmanCode::HuffmanCode(const std::pair &endpoints, + const It begin, const It end) : endpoints(endpoints), ncodewords(ncodewords_from_endpoints(endpoints)), frequencies(ncodewords), codewords(ncodewords) { - populate_frequencies(pairs); + populate_frequencies(begin, end); create_code_creation_tree(); recursively_set_codewords(queue.top(), {}); } @@ -290,11 +290,6 @@ MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { throw std::runtime_error("unrecognized Huffman codeword mapping"); } const Frequencies &frequencies_ = header.frequencies(); - // TODO: Change `HuffmanCode` constructor so it can take a pair of iterators - // dereferencing to (something convertible to) - // `std::pair`s directly. 
- const std::vector> pairs( - frequencies_.begin(), frequencies_.end()); if (header.missed_encoding() != pb::HuffmanHeader::LITERAL) { throw std::runtime_error("unrecognized Huffman missed buffer encoding"); @@ -313,7 +308,8 @@ MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { "number of bytes in hit buffer"); } - const HuffmanCode code(endpoints, pairs); + const HuffmanCode code(endpoints, frequencies_.begin(), + frequencies_.end()); // TODO: Maybe add a member function for this. const std::size_t nout = std::accumulate(code.frequencies.begin(), code.frequencies.end(), diff --git a/src/huffman.cpp b/src/huffman.cpp index 2ccd49b8cf..ea5a23bf68 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -231,7 +231,7 @@ MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { } } - HuffmanCode code(nql_endpoints, pairs); + HuffmanCode code(nql_endpoints, pairs.begin(), pairs.end()); MemoryBuffer out(nquantized); Symbol *q = out.data.get(); From 1532c6661ff0e480949e65735dd152e076f778ec Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Fri, 10 Jun 2022 12:48:35 -0400 Subject: [PATCH 40/58] Add function to check quantization buffer size. --- include/format.hpp | 8 ++++++ src/format.cpp | 29 ++++++++++++++++++++++ tests/src/test_format.cpp | 52 +++++++++++++++------------------------ 3 files changed, 57 insertions(+), 32 deletions(-) diff --git a/include/format.hpp b/include/format.hpp index 0a817e78f6..f3b166ffac 100644 --- a/include/format.hpp +++ b/include/format.hpp @@ -66,6 +66,14 @@ serialize_header_crc32(std::uint_least64_t crc32); //!\param p Pointer whose alignment will be checked. template void check_alignment(void const *const p); +//! Check that a quantization buffer has the right alignment and a valid size. +//! +//!\param header Self-describing dataset header. +//!\param p Quantization buffer. +//!\param n Size in bytes of quantization buffer. 
+void check_quantization_buffer(const pb::Header &header, void const *const p, + const std::size_t n); + //! Determine whether an integral type is big endian. template bool big_endian(); diff --git a/src/format.cpp b/src/format.cpp index 0464e4b791..83b138db81 100644 --- a/src/format.cpp +++ b/src/format.cpp @@ -45,6 +45,35 @@ serialize_header_crc32(std::uint_least64_t crc32) { return serialize(crc32); } +namespace { + +template +void check_quantization_buffer_(void const *const p, const std::size_t n) { + if (n % sizeof(Int)) { + throw std::runtime_error( + "quantization buffer size not a multiple of quantization type size"); + } + check_alignment(p); +} + +} // namespace + +void check_quantization_buffer(const pb::Header &header, void const *const p, + const std::size_t n) { + switch (header.quantization().type()) { + case pb::Quantization::INT8_T: + return check_quantization_buffer_(p, n); + case pb::Quantization::INT16_T: + return check_quantization_buffer_(p, n); + case pb::Quantization::INT32_T: + return check_quantization_buffer_(p, n); + case pb::Quantization::INT64_T: + return check_quantization_buffer_(p, n); + default: + throw std::runtime_error("unrecognized quantization type"); + } +} + template <> pb::Dataset::Type type_to_dataset_type() { return pb::Dataset::FLOAT; } diff --git a/tests/src/test_format.cpp b/tests/src/test_format.cpp index 64943b2421..970eb87fc3 100644 --- a/tests/src/test_format.cpp +++ b/tests/src/test_format.cpp @@ -180,41 +180,29 @@ TEST_CASE("dataset types", "[format]") { REQUIRE(mgard::type_to_dataset_type() == mgard::pb::Dataset::DOUBLE); } -TEST_CASE("quantization type sizes", "[format]") { - mgard::pb::Header header; - mgard::pb::Quantization &quantization = *header.mutable_quantization(); - const std::size_t ndof = 1; - - quantization.set_type(mgard::pb::Quantization::INT8_T); - { - const mgard::MemoryBuffer buffer = - mgard::quantization_buffer(header, ndof); - REQUIRE_NOTHROW(mgard::check_alignment(buffer.data.get())); - 
REQUIRE(buffer.size == 1); - } +namespace { - quantization.set_type(mgard::pb::Quantization::INT16_T); - { - const mgard::MemoryBuffer buffer = - mgard::quantization_buffer(header, ndof); - REQUIRE_NOTHROW(mgard::check_alignment(buffer.data.get())); - REQUIRE(buffer.size == 2); - } +void test_quantization_buffer(const mgard::pb::Quantization::Type type, + const std::size_t size) { + mgard::pb::Header header; + header.mutable_quantization()->set_type(type); + const mgard::MemoryBuffer buffer = + mgard::quantization_buffer(header, 1); + REQUIRE_NOTHROW( + mgard::check_quantization_buffer(header, buffer.data.get(), buffer.size)); + REQUIRE(buffer.size == size); +} - quantization.set_type(mgard::pb::Quantization::INT32_T); - { - const mgard::MemoryBuffer buffer = - mgard::quantization_buffer(header, ndof); - REQUIRE_NOTHROW(mgard::check_alignment(buffer.data.get())); - REQUIRE(buffer.size == 4); - } +} // namespace - quantization.set_type(mgard::pb::Quantization::INT64_T); - { - const mgard::MemoryBuffer buffer = - mgard::quantization_buffer(header, ndof); - REQUIRE_NOTHROW(mgard::check_alignment(buffer.data.get())); - REQUIRE(buffer.size == 8); +TEST_CASE("quantization buffers", "[format]") { + const std::vector> + pairs{{mgard::pb::Quantization::INT8_T, 1}, + {mgard::pb::Quantization::INT16_T, 2}, + {mgard::pb::Quantization::INT32_T, 4}, + {mgard::pb::Quantization::INT64_T, 8}}; + for (const auto [type, size] : pairs) { + test_quantization_buffer(type, size); } } From 56004d2ee01c4fc9adbc5423c544e41d6211f8cd Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 15 Jun 2022 12:46:19 -0400 Subject: [PATCH 41/58] Automatically calculate Huffman hit buffer size. 
--- include/huffman.hpp | 5 ++--- src/compressors.cpp | 18 +++++------------- src/huffman.cpp | 27 +++++++++++---------------- tests/src/compressors_regression.cpp | 8 +++++--- tests/src/huffman_regression.cpp | 7 ++----- 5 files changed, 25 insertions(+), 40 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index b23d19b933..97300f91ef 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -25,11 +25,10 @@ struct HuffmanEncodedStream { //! Constructor. //! //!\param nbits Length in bits of the compressed stream. - //!\param ncompressed Length in bytes of the compressed stream. //!\param nmissed Length in bytes of the missed array. //!\param ntable Length in bytes of the frequency table. - HuffmanEncodedStream(const std::size_t nbits, const std::size_t ncompressed, - const std::size_t nmissed, const std::size_t ntable); + HuffmanEncodedStream(const std::size_t nbits, const std::size_t nmissed, + const std::size_t ntable); //! Length in bits of the compressed stream. std::size_t nbits; diff --git a/src/compressors.cpp b/src/compressors.cpp index daa6bc8f2a..f9ff0254d6 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -54,12 +54,7 @@ void decompress_memory_huffman(unsigned char const *const src, #endif } - // `huffman_decoding` expects the size of the hit buffer to be a multiple of - // `sizeof(unsigned int)`. We'll zero out any extra bytes below. 
- const std::size_t nbytes = - sizeof(unsigned int) * - ((nhit + sizeof(unsigned int) - 1) / sizeof(unsigned int)); - HuffmanEncodedStream encoded(nbits, nbytes, nmissed, nfrequencies); + HuffmanEncodedStream encoded(nbits, nmissed, nfrequencies); { unsigned char const *begin; unsigned char const *end; @@ -69,15 +64,12 @@ void decompress_memory_huffman(unsigned char const *const src, std::copy(begin, end, encoded.frequencies.data.get()); begin = end; - end = begin + nhit; + assert(encoded.hit.size <= nhit); + end = begin + encoded.hit.size; std::copy(begin, end, encoded.hit.data.get()); - { - unsigned char *const p = encoded.hit.data.get(); - std::fill(p + nhit, p + nbytes, 0); - } - - begin = end; + // Skip any bytes between `begin + encoded.hit.size` and `begin + nhit`. + begin = end + nhit - encoded.hit.size; end = begin + nmissed; std::copy(begin, end, encoded.missed.data.get()); } diff --git a/src/huffman.cpp b/src/huffman.cpp index ea5a23bf68..7dc1d77fbe 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -10,18 +10,21 @@ #include #include -#include - #include "huffman.hpp" namespace mgard { HuffmanEncodedStream::HuffmanEncodedStream(const std::size_t nbits, - const std::size_t ncompressed, const std::size_t nmissed, - const std::size_t nfrequencies) - : nbits(nbits), hit(ncompressed), missed(nmissed), - frequencies(nfrequencies) {} + const std::size_t ntable) + : nbits(nbits), hit(sizeof(unsigned int) * + ((nbits + CHAR_BIT * sizeof(unsigned int) - 1) / + (CHAR_BIT * sizeof(unsigned int)))), + missed(nmissed), frequencies(ntable) { + unsigned char *const p = hit.data.get(); + // Zero out the bits/bytes we won't write to. 
+ std::fill(p + (nbits + CHAR_BIT - 1) / CHAR_BIT, p + hit.size, 0); +} void HuffmanCodeword::push_back(const bool bit) { const unsigned char offset = length % CHAR_BIT; @@ -114,19 +117,11 @@ HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, const std::size_t nbits = std::inner_product(code.frequencies.begin(), code.frequencies.end(), lengths.begin(), static_cast(0)); - const std::size_t nbytes = - sizeof(unsigned int) * ((nbits + CHAR_BIT * sizeof(unsigned int) - 1) / - (CHAR_BIT * sizeof(unsigned int))); - if (nbytes % sizeof(unsigned int)) { - throw std::runtime_error( - "`nbytes` not bumped up to nearest multiple of `unsigned int` size"); - } - const std::size_t nnz = code.ncodewords - std::count(code.frequencies.begin(), code.frequencies.end(), 0); - HuffmanEncodedStream out(nbits, nbytes, code.nmissed() * sizeof(MissedSymbol), + HuffmanEncodedStream out(nbits, code.nmissed() * sizeof(MissedSymbol), 2 * nnz * sizeof(std::size_t)); // Write frequency table. 
@@ -184,7 +179,7 @@ HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, } } - endianness_shuffle(buffer, nbytes); + endianness_shuffle(buffer, out.hit.size); return out; } diff --git a/tests/src/compressors_regression.cpp b/tests/src/compressors_regression.cpp index c65bac9cd1..7f7a3ecdf1 100644 --- a/tests/src/compressors_regression.cpp +++ b/tests/src/compressors_regression.cpp @@ -119,7 +119,7 @@ void decompress_memory_huffman(unsigned char const *const src, #endif } - HuffmanEncodedStream encoded(nbits, nhit, nmissed, nfrequencies); + HuffmanEncodedStream encoded(nbits, nmissed, nfrequencies); { unsigned char const *begin; unsigned char const *end; @@ -129,10 +129,12 @@ void decompress_memory_huffman(unsigned char const *const src, std::copy(begin, end, encoded.frequencies.data.get()); begin = end; - end = begin + nhit; + assert(encoded.hit.size <= nhit); + end = begin + encoded.hit.size; std::copy(begin, end, encoded.hit.data.get()); - begin = end; + // Skip any bytes between `begin + encoded.hit.size` and `begin + nhit`. + begin = end + nhit - encoded.hit.size; end = begin + nmissed; std::copy(begin, end, encoded.missed.data.get()); } diff --git a/tests/src/huffman_regression.cpp b/tests/src/huffman_regression.cpp index c2c58bdc95..6d8739dbde 100644 --- a/tests/src/huffman_regression.cpp +++ b/tests/src/huffman_regression.cpp @@ -215,15 +215,12 @@ HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, nnz += frequency ? 
1 : 0; } - const std::size_t nbytes = - sizeof(unsigned int) * ((nbits + CHAR_BIT * sizeof(unsigned int) - 1) / - (CHAR_BIT * sizeof(unsigned int))); - HuffmanEncodedStream out(nbits, nbytes, num_miss * sizeof(int), + HuffmanEncodedStream out(nbits, num_miss * sizeof(int), 2 * nnz * sizeof(std::size_t)); unsigned int *const hit = reinterpret_cast(out.hit.data.get()); - std::fill(hit, hit + nbytes / sizeof(unsigned int), 0u); + std::fill(hit, hit + out.hit.size / sizeof(unsigned int), 0u); int *missed = reinterpret_cast(out.missed.data.get()); From 25e639de39e4891701bcae03e40c0a945590f8dd Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 15 Jun 2022 11:31:53 -0400 Subject: [PATCH 42/58] Add `HuffmanEncodedStream` {,de}serializer. --- include/huffman.hpp | 24 +++++ src/compressors.cpp | 109 +------------------ src/huffman.cpp | 155 +++++++++++++++++++++++---- tests/src/compressors_regression.cpp | 3 +- tests/src/test_compressors.cpp | 5 +- tests/src/test_format.cpp | 2 + tests/src/test_huffman.cpp | 47 ++++++++ 7 files changed, 213 insertions(+), 132 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 97300f91ef..6a746deead 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -43,6 +43,30 @@ struct HuffmanEncodedStream { MemoryBuffer frequencies; }; +//! Serialize a Huffman-encoded stream and then compress. +//! +//!\deprecated +//! +//! The serialized stream will be compressed with ZSTD if `MGARD_ZSTD` is +//! defined and with `zlib` otherwise. +//! +//!\param encoded Huffman-encoded stream to serialize and compress. +MemoryBuffer +serialize_compress(const HuffmanEncodedStream &encoded); + +//! Decompress and then deserialize a Huffman-encoded stream. +//! +//!\deprecated +//! +//! The buffer will be decompressed with ZSTD if `MGARD_ZSTD` if defined and +//! with `zlib` otherwise. +//! +//!\param src Buffer containing serialized and compressed Huffman-encoded +//! stream. +//!\param srcLen Size in bytes of the buffer. 
+HuffmanEncodedStream decompress_deserialize(unsigned char const *const src, + const std::size_t srcLen); + //! Codeword (in progress) associated to a node in a Huffman code creation tree. struct HuffmanCodeword { //! Bytes containing the bits of the codeword. diff --git a/src/compressors.cpp b/src/compressors.cpp index f9ff0254d6..85e2930512 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -22,58 +22,10 @@ namespace mgard { -namespace { - -std::size_t hit_buffer_size(const std::size_t nbits) { - return nbits / CHAR_BIT + sizeof(unsigned int); -} - -} // namespace - void decompress_memory_huffman(unsigned char const *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen) { - std::size_t const *const sizes = reinterpret_cast(src); - const std::size_t nfrequencies = sizes[0]; - const std::size_t nbits = sizes[1]; - const std::size_t nmissed = sizes[2]; - const std::size_t nhit = hit_buffer_size(nbits); - - MemoryBuffer buffer(nfrequencies + nhit + nmissed); - { - const std::size_t offset = 3 * sizeof(std::size_t); - unsigned char const *const src_ = src + offset; - const std::size_t srcLen_ = srcLen - offset; - unsigned char *const dst_ = buffer.data.get(); - const std::size_t dstLen_ = buffer.size; - -#ifndef MGARD_ZSTD - decompress_memory_z(src_, srcLen_, dst_, dstLen_); -#else - decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); -#endif - } - - HuffmanEncodedStream encoded(nbits, nmissed, nfrequencies); - { - unsigned char const *begin; - unsigned char const *end; - - begin = buffer.data.get(); - end = begin + nfrequencies; - std::copy(begin, end, encoded.frequencies.data.get()); - - begin = end; - assert(encoded.hit.size <= nhit); - end = begin + encoded.hit.size; - std::copy(begin, end, encoded.hit.data.get()); - - // Skip any bytes between `begin + encoded.hit.size` and `begin + nhit`. 
- begin = end + nhit - encoded.hit.size; - end = begin + nmissed; - std::copy(begin, end, encoded.missed.data.get()); - } - + const HuffmanEncodedStream encoded = decompress_deserialize(src, srcLen); const MemoryBuffer decoded = huffman_decoding(encoded); { long int const *const p = decoded.data.get(); @@ -85,67 +37,10 @@ void decompress_memory_huffman(unsigned char const *const src, } } -namespace { - -using Constituent = std::pair; - -MemoryBuffer -gather_constituents(const std::vector &constituents) { - std::size_t nbuffer = 0; - for (const Constituent &constituent : constituents) { - nbuffer += constituent.second; - } - MemoryBuffer buffer(nbuffer); - unsigned char *p = buffer.data.get(); - for (const Constituent &constituent : constituents) { - std::memcpy(p, constituent.first, constituent.second); - p += constituent.second; - } - return buffer; -} - -} // namespace - MemoryBuffer compress_memory_huffman(long int const *const src, const std::size_t srcLen) { const HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); - - assert(not(encoded.hit.size % sizeof(unsigned int))); - - static_assert(CHAR_BIT == 8, "code written assuming `CHAR_BIT == 8`"); - static_assert(sizeof(unsigned int) == 4, - "code written assuming `sizeof(unsigned int) == 4`"); - const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); - // Number of hit buffer padding bytes. - const std::size_t nhbpb = offset ? 
offset / CHAR_BIT : sizeof(unsigned int); - - assert(encoded.hit.size + nhbpb == hit_buffer_size(encoded.nbits)); - - unsigned char const *hbpb = new unsigned char[nhbpb](); - MemoryBuffer payload = gather_constituents({ - {encoded.frequencies.data.get(), encoded.frequencies.size}, - {encoded.hit.data.get(), encoded.hit.size}, - {hbpb, nhbpb}, - {encoded.missed.data.get(), encoded.missed.size}, - }); - delete[] hbpb; - -#ifndef MGARD_ZSTD - const MemoryBuffer out_data = - compress_memory_z(payload.data.get(), payload.size); -#else - const MemoryBuffer out_data = - compress_memory_zstd(payload.data.get(), payload.size); -#endif - - return gather_constituents( - {{reinterpret_cast(&encoded.frequencies.size), - sizeof(encoded.frequencies.size)}, - {reinterpret_cast(&encoded.nbits), - sizeof(encoded.nbits)}, - {reinterpret_cast(&encoded.missed.size), - sizeof(encoded.missed.size)}, - {out_data.data.get(), out_data.size}}); + return serialize_compress(encoded); } #ifdef MGARD_ZSTD diff --git a/src/huffman.cpp b/src/huffman.cpp index 7dc1d77fbe..369497cabc 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -8,8 +8,12 @@ #include #include #include +#include #include +#include + +#include "compressors.hpp" #include "huffman.hpp" namespace mgard { @@ -22,10 +26,136 @@ HuffmanEncodedStream::HuffmanEncodedStream(const std::size_t nbits, (CHAR_BIT * sizeof(unsigned int)))), missed(nmissed), frequencies(ntable) { unsigned char *const p = hit.data.get(); - // Zero out the bits/bytes we won't write to. + // Zero out the bytes we won't write to. If `nbits % CHAR_BIT`, there will + // still be bits in the final byte that aren't zeroed out. 
std::fill(p + (nbits + CHAR_BIT - 1) / CHAR_BIT, p + hit.size, 0); } +namespace { + +void check_type_sizes() { + static_assert(CHAR_BIT == 8, + "code written with assumption that `CHAR_BIT == 8`"); + static_assert( + sizeof(unsigned int) == 4, + "code written with assumption that `sizeof(unsigned int) == 4`"); + static_assert(sizeof(int) == 4, + "code written with assumption that `sizeof(int) == 4`"); + static_assert( + sizeof(std::size_t) == 8, + "code written with assumption that `sizeof(unsigned int) == 8`"); +} + +using Constituent = std::pair; + +MemoryBuffer +gather(const std::vector &constituents) { + std::size_t nbuffer = 0; + for (const Constituent &constituent : constituents) { + nbuffer += constituent.second; + } + MemoryBuffer buffer(nbuffer); + unsigned char *p = buffer.data.get(); + for (const Constituent &constituent : constituents) { + std::memcpy(p, constituent.first, constituent.second); + p += constituent.second; + } + return buffer; +} + +} // namespace + +MemoryBuffer +serialize_compress(const HuffmanEncodedStream &encoded) { + check_type_sizes(); + + assert(not(encoded.hit.size % sizeof(unsigned int))); + + const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); + // Number of hit buffer padding bytes. + const std::size_t nhbpb = offset ? offset / CHAR_BIT : sizeof(unsigned int); + + // The righthand side is how the size in bytes of the padded hit buffer was + // originally calculated. 
+ assert(encoded.hit.size + nhbpb == + encoded.nbits / CHAR_BIT + sizeof(unsigned int)); + + unsigned char const *hbpb = new unsigned char[nhbpb](); + MemoryBuffer payload = gather({ + {encoded.frequencies.data.get(), encoded.frequencies.size}, + {encoded.hit.data.get(), encoded.hit.size}, + {hbpb, nhbpb}, + {encoded.missed.data.get(), encoded.missed.size}, + }); + delete[] hbpb; + +#ifndef MGARD_ZSTD + const MemoryBuffer out_data = compress_memory_z( + const_cast(payload.data.get()), payload.size); +#else + const MemoryBuffer out_data = + compress_memory_zstd(payload.data.get(), payload.size); +#endif + + return gather( + {{reinterpret_cast(&encoded.frequencies.size), + sizeof(encoded.frequencies.size)}, + {reinterpret_cast(&encoded.nbits), + sizeof(encoded.nbits)}, + {reinterpret_cast(&encoded.missed.size), + sizeof(encoded.missed.size)}, + {out_data.data.get(), out_data.size}}); +} + +HuffmanEncodedStream decompress_deserialize(unsigned char const *const src, + const std::size_t srcLen) { + std::size_t const *const sizes = reinterpret_cast(src); + const std::size_t nfrequencies = sizes[0]; + const std::size_t nbits = sizes[1]; + const std::size_t nmissed = sizes[2]; + // This is how the size in bytes of the padded hit buffer was calculated + // in `decompress_memory_huffman` before this function was introduced. 
+ const std::size_t nhit = nbits / CHAR_BIT + sizeof(unsigned int); + + MemoryBuffer buffer(nfrequencies + nhit + nmissed); + { + const std::size_t offset = 3 * sizeof(std::size_t); + unsigned char const *const src_ = src + offset; + const std::size_t srcLen_ = srcLen - offset; + unsigned char *const dst_ = buffer.data.get(); + const std::size_t dstLen_ = buffer.size; + +#ifndef MGARD_ZSTD + decompress_memory_z(const_cast(src_), srcLen_, + dst_, dstLen_); +#else + decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); +#endif + } + + HuffmanEncodedStream encoded(nbits, nmissed, nfrequencies); + { + unsigned char const *begin; + unsigned char const *end; + + begin = buffer.data.get(); + end = begin + nfrequencies; + std::copy(begin, end, encoded.frequencies.data.get()); + + begin = end; + assert(encoded.hit.size <= nhit); + end = begin + encoded.hit.size; + std::copy(begin, end, encoded.hit.data.get()); + + // Skip any bytes between `begin + encoded.hit.size` and `begin + nhit`. + begin = end + nhit - encoded.hit.size; + end = begin + nmissed; + std::copy(begin, end, encoded.missed.data.get()); + } + + return encoded; +} + void HuffmanCodeword::push_back(const bool bit) { const unsigned char offset = length % CHAR_BIT; if (not offset) { @@ -76,29 +206,10 @@ void endianness_shuffle(unsigned char *const buffer, const std::size_t nbytes) { } } -} // namespace -namespace { - -void check_type_sizes() { - static_assert(CHAR_BIT == 8, - "code written with assumption that `CHAR_BIT == 8`"); - static_assert( - sizeof(unsigned int) == 4, - "code written with assumption that `sizeof(unsigned int) == 4`"); - static_assert(sizeof(int) == 4, - "code written with assumption that `sizeof(int) == 4`"); - static_assert( - sizeof(std::size_t) == 8, - "code written with assumption that `sizeof(unsigned int) == 8`"); -} - -} // namespace - -namespace { - const std::pair nql_endpoints{ -static_cast((nql - 1) / 2), nql / 2 - 1}; -} + +} // namespace HuffmanEncodedStream 
huffman_encoding(long int const *const quantized_data, const std::size_t n) { diff --git a/tests/src/compressors_regression.cpp b/tests/src/compressors_regression.cpp index 7f7a3ecdf1..4284f31b31 100644 --- a/tests/src/compressors_regression.cpp +++ b/tests/src/compressors_regression.cpp @@ -113,7 +113,8 @@ void decompress_memory_huffman(unsigned char const *const src, const std::size_t dstLen_ = buffer.size; #ifndef MGARD_ZSTD - decompress_memory_z(src_, srcLen_, dst_, dstLen_); + decompress_memory_z(const_cast(src_), srcLen_, + dst_, dstLen_); #else decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); #endif diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp index 501f085039..3f5568dda7 100644 --- a/tests/src/test_compressors.cpp +++ b/tests/src/test_compressors.cpp @@ -172,6 +172,7 @@ TEST_CASE("Huffman compression", "[compressors] [!mayfail]") { SECTION("long integers") { test_huffman_identity(gen, n); } } +#ifdef MGARD_ZSTD namespace { void test_zstd_identity(std::uniform_int_distribution &dis, @@ -192,7 +193,6 @@ void test_zstd_identity(std::uniform_int_distribution &dis, } // namespace -#ifdef MGARD_ZSTD TEST_CASE("zstd compression", "[compressors]") { std::uniform_int_distribution dis; std::default_random_engine gen(158648); @@ -262,7 +262,8 @@ TEST_CASE("compression with header configuration", "[compressors]") { compressed.data.get(), compressed.size, dst, quantizedLen); #else REQUIRE(e.compressor() == mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); - mgard::decompress_memory_z(compressed.data.get(), compressed.size, dst, + mgard::decompress_memory_z(compressed.data.get(), compressed.size, + reinterpret_cast(dst), quantizedLen); #endif REQUIRE(std::equal(quantized, quantized + ndof, dst)); diff --git a/tests/src/test_format.cpp b/tests/src/test_format.cpp index 970eb87fc3..47055e6ac1 100644 --- a/tests/src/test_format.cpp +++ b/tests/src/test_format.cpp @@ -350,11 +350,13 @@ TEST_CASE("reading encoding compressor", "[format]") { 
e.set_compressor(mgard::pb::Encoding::X_HUFFMAN_LZ4); REQUIRE_THROWS(mgard::read_encoding_compressor(header)); } +#ifdef MGARD_ZSTD { e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); REQUIRE(mgard::read_encoding_compressor(header) == mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); } +#endif } namespace { diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index 0921f4538b..cc6da1523e 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include "testing_utilities.hpp" @@ -215,3 +216,49 @@ TEMPLATE_TEST_CASE("Huffman inversion", "[huffman]", std::int8_t, std::int16_t, test_inversion_random(10000, -100, 100, gen); } } + +TEST_CASE("`HuffmanEncodedStream` serialization inversion", "[huffman]") { + // This is not intended to be a valid `HuffmanEncodedStream`. + const std::size_t nbits = 2718; + const std::size_t nmissed = 896 * sizeof(int); + const std::size_t ntable = 681 * 2 * sizeof(std::size_t); + const mgard::HuffmanEncodedStream original(nbits, nmissed, ntable); + { + unsigned char *const p = original.hit.data.get(); + std::iota(p, p + original.hit.size, 1u); + } + { + unsigned char *const p = original.missed.data.get(); + std::iota(p, p + nmissed, 90u); + } + { + unsigned char *const p = original.frequencies.data.get(); + std::iota(p, p + ntable, 51u); + } + + const mgard::MemoryBuffer serialized = + mgard::serialize_compress(original); + const mgard::HuffmanEncodedStream deserialized = + mgard::decompress_deserialize(serialized.data.get(), serialized.size); + + REQUIRE(original.nbits == deserialized.nbits); + REQUIRE(original.hit.size == deserialized.hit.size); + REQUIRE(original.missed.size == deserialized.missed.size); + REQUIRE(original.frequencies.size == deserialized.frequencies.size); + + { + unsigned char const *const p = original.hit.data.get(); + unsigned char const *const q = deserialized.hit.data.get(); + REQUIRE(std::equal(p, p + original.hit.size, q)); + } + { 
+ unsigned char const *const p = original.missed.data.get(); + unsigned char const *const q = deserialized.missed.data.get(); + REQUIRE(std::equal(p, p + nmissed, q)); + } + { + unsigned char const *const p = original.frequencies.data.get(); + unsigned char const *const q = deserialized.frequencies.data.get(); + REQUIRE(std::equal(p, p + ntable, q)); + } +} From a05703d3e4557e8194ad97ec60e68b6a57669563 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Wed, 15 Jun 2022 14:51:10 -0400 Subject: [PATCH 43/58] Select serialization compressor at runtime. --- include/huffman.hpp | 15 +++++---- src/compressors.cpp | 23 ++++++++++++-- src/huffman.cpp | 65 +++++++++++++++++++++++++++----------- tests/src/test_huffman.cpp | 25 +++++++++++++-- 4 files changed, 99 insertions(+), 29 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 6a746deead..e86841df57 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -10,6 +10,7 @@ #include #include +#include "format.hpp" #include "utilities.hpp" namespace mgard { @@ -47,24 +48,26 @@ struct HuffmanEncodedStream { //! //!\deprecated //! -//! The serialized stream will be compressed with ZSTD if `MGARD_ZSTD` is -//! defined and with `zlib` otherwise. +//! The header will determine which compressor is used. //! +//!\param header Header for the self-describing buffer. //!\param encoded Huffman-encoded stream to serialize and compress. MemoryBuffer -serialize_compress(const HuffmanEncodedStream &encoded); +serialize_compress(const pb::Header &header, + const HuffmanEncodedStream &encoded); //! Decompress and then deserialize a Huffman-encoded stream. //! //!\deprecated //! -//! The buffer will be decompressed with ZSTD if `MGARD_ZSTD` if defined and -//! with `zlib` otherwise. +//! The header will determine which decompressor is used. //! +//!\param header Header of the self-describing buffer. //!\param src Buffer containing serialized and compressed Huffman-encoded //! stream. 
//!\param srcLen Size in bytes of the buffer. -HuffmanEncodedStream decompress_deserialize(unsigned char const *const src, +HuffmanEncodedStream decompress_deserialize(const pb::Header &header, + unsigned char const *const src, const std::size_t srcLen); //! Codeword (in progress) associated to a node in a Huffman code creation tree. diff --git a/src/compressors.cpp b/src/compressors.cpp index 85e2930512..35d16a52c2 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -25,7 +25,17 @@ namespace mgard { void decompress_memory_huffman(unsigned char const *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen) { - const HuffmanEncodedStream encoded = decompress_deserialize(src, srcLen); + // Dummy header until we change the signature of `decompress_memory_huffman`. + pb::Header header; + header.mutable_encoding()->set_compressor( +#ifdef MGARD_ZSTD + pb::Encoding::CPU_HUFFMAN_ZSTD +#else + pb::Encoding::CPU_HUFFMAN_ZLIB +#endif + ); + const HuffmanEncodedStream encoded = + decompress_deserialize(header, src, srcLen); const MemoryBuffer decoded = huffman_decoding(encoded); { long int const *const p = decoded.data.get(); @@ -40,7 +50,16 @@ void decompress_memory_huffman(unsigned char const *const src, MemoryBuffer compress_memory_huffman(long int const *const src, const std::size_t srcLen) { const HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); - return serialize_compress(encoded); + // Dummy header until we change the signature of `compress_memory_huffman`. 
+ pb::Header header; + header.mutable_encoding()->set_compressor( +#ifdef MGARD_ZSTD + pb::Encoding::CPU_HUFFMAN_ZSTD +#else + pb::Encoding::CPU_HUFFMAN_ZLIB +#endif + ); + return serialize_compress(header, encoded); } #ifdef MGARD_ZSTD diff --git a/src/huffman.cpp b/src/huffman.cpp index 369497cabc..1ebb2fbded 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -11,8 +11,6 @@ #include #include -#include - #include "compressors.hpp" #include "huffman.hpp" @@ -63,13 +61,35 @@ gather(const std::vector &constituents) { return buffer; } +MemoryBuffer +compress_serialized_huffman(const pb::Header &header, + const MemoryBuffer &payload) { + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_HUFFMAN_ZLIB: + return compress_memory_z( + const_cast(payload.data.get()), payload.size); + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + return compress_memory_zstd(payload.data.get(), payload.size); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + default: + throw std::runtime_error("unrecognized lossless compressor"); + } +} + } // namespace MemoryBuffer -serialize_compress(const HuffmanEncodedStream &encoded) { +serialize_compress(const pb::Header &header, + const HuffmanEncodedStream &encoded) { check_type_sizes(); - assert(not(encoded.hit.size % sizeof(unsigned int))); + if (header.encoding().serialization() != pb::Encoding::DEPRECATED) { + throw std::runtime_error( + "Huffman tree not to be serialized with deprecated method"); + } const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); // Number of hit buffer padding bytes. 
@@ -89,14 +109,8 @@ serialize_compress(const HuffmanEncodedStream &encoded) { }); delete[] hbpb; -#ifndef MGARD_ZSTD - const MemoryBuffer out_data = compress_memory_z( - const_cast(payload.data.get()), payload.size); -#else - const MemoryBuffer out_data = - compress_memory_zstd(payload.data.get(), payload.size); -#endif - + const MemoryBuffer compressed = + compress_serialized_huffman(header, payload); return gather( {{reinterpret_cast(&encoded.frequencies.size), sizeof(encoded.frequencies.size)}, @@ -104,11 +118,17 @@ serialize_compress(const HuffmanEncodedStream &encoded) { sizeof(encoded.nbits)}, {reinterpret_cast(&encoded.missed.size), sizeof(encoded.missed.size)}, - {out_data.data.get(), out_data.size}}); + {compressed.data.get(), compressed.size}}); } -HuffmanEncodedStream decompress_deserialize(unsigned char const *const src, +HuffmanEncodedStream decompress_deserialize(const pb::Header &header, + unsigned char const *const src, const std::size_t srcLen) { + if (header.encoding().serialization() != pb::Encoding::DEPRECATED) { + throw std::runtime_error( + "Huffman tree not serialized with deprecated method"); + } + std::size_t const *const sizes = reinterpret_cast(src); const std::size_t nfrequencies = sizes[0]; const std::size_t nbits = sizes[1]; @@ -125,12 +145,21 @@ HuffmanEncodedStream decompress_deserialize(unsigned char const *const src, unsigned char *const dst_ = buffer.data.get(); const std::size_t dstLen_ = buffer.size; -#ifndef MGARD_ZSTD - decompress_memory_z(const_cast(src_), srcLen_, - dst_, dstLen_); + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_HUFFMAN_ZLIB: + decompress_memory_z(const_cast(src_), srcLen_, + dst_, dstLen_); + break; + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); + break; #else - decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); + throw std::runtime_error("MGARD compiled without ZSTD support"); #endif + default: + throw 
std::runtime_error("unrecognized lossless compressor"); + } } HuffmanEncodedStream encoded(nbits, nmissed, nfrequencies); diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index cc6da1523e..a444d0c6c1 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -10,6 +10,7 @@ #include "testing_utilities.hpp" +#include "format.hpp" #include "huffman.hpp" #include "huffman_regression.hpp" @@ -217,7 +218,15 @@ TEMPLATE_TEST_CASE("Huffman inversion", "[huffman]", std::int8_t, std::int16_t, } } -TEST_CASE("`HuffmanEncodedStream` serialization inversion", "[huffman]") { +namespace { + +void test_hes_serialization_inversion( + const mgard::pb::Encoding::Compressor compressor) { + mgard::pb::Header header; + mgard::pb::Encoding &encoding = *header.mutable_encoding(); + encoding.set_compressor(compressor); + encoding.set_serialization(mgard::pb::Encoding::DEPRECATED); + // This is not intended to be a valid `HuffmanEncodedStream`. const std::size_t nbits = 2718; const std::size_t nmissed = 896 * sizeof(int); @@ -237,9 +246,10 @@ TEST_CASE("`HuffmanEncodedStream` serialization inversion", "[huffman]") { } const mgard::MemoryBuffer serialized = - mgard::serialize_compress(original); + mgard::serialize_compress(header, original); const mgard::HuffmanEncodedStream deserialized = - mgard::decompress_deserialize(serialized.data.get(), serialized.size); + mgard::decompress_deserialize(header, serialized.data.get(), + serialized.size); REQUIRE(original.nbits == deserialized.nbits); REQUIRE(original.hit.size == deserialized.hit.size); @@ -262,3 +272,12 @@ TEST_CASE("`HuffmanEncodedStream` serialization inversion", "[huffman]") { REQUIRE(std::equal(p, p + ntable, q)); } } + +} // namespace + +TEST_CASE("`HuffmanEncodedStream` serialization inversion", "[huffman]") { + test_hes_serialization_inversion(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); +#ifdef MGARD_ZSTD + test_hes_serialization_inversion(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); +#endif +} From 
3e53ce41e824c7bde59a0c3bd5abc761198ca805 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 14 Jun 2022 13:38:52 -0400 Subject: [PATCH 44/58] Enable `RFMH` in `{,de}compress`. --- CMakeLists.txt | 4 +- include/compress.tpp | 2 - include/compressors.hpp | 21 -- include/huffman.hpp | 2 + src/compressors.cpp | 441 +++++++++++++++++++++++++++++++++------- src/format.cpp | 1 + src/mgard.proto | 6 +- 7 files changed, 374 insertions(+), 103 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a2902e6db..c92cca399a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,11 +11,11 @@ endif() list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_LIST_DIR}/cmake") set(MGARD_VERSION_MAJOR "1") -set(MGARD_VERSION_MINOR "2") +set(MGARD_VERSION_MINOR "3") set(MGARD_VERSION_PATCH "0") set(MGARD_FILE_VERSION_MAJOR "1") -set(MGARD_FILE_VERSION_MINOR "0") +set(MGARD_FILE_VERSION_MINOR "1") set(MGARD_FILE_VERSION_PATCH "0") project( diff --git a/include/compress.tpp b/include/compress.tpp index ebd9a76e83..867b6cfa9f 100644 --- a/include/compress.tpp +++ b/include/compress.tpp @@ -28,8 +28,6 @@ namespace mgard { -using DEFAULT_INT_T = std::int64_t; - template CompressedDataset compress(const TensorMeshHierarchy &hierarchy, Real *const v, diff --git a/include/compressors.hpp b/include/compressors.hpp index 1542d3eeb2..c946538b50 100644 --- a/include/compressors.hpp +++ b/include/compressors.hpp @@ -16,27 +16,6 @@ namespace mgard { -//! Compress an array using a Huffman tree. -//! -//!\deprecated -//! -//!\param[in] src Array to be compressed. -//!\param[in] srcLen Size of array (number of elements) to be compressed. -MemoryBuffer compress_memory_huffman(long int const *const src, - const std::size_t srcLen); - -//! Decompress an array compressed with `compress_memory_huffman`. -//! -//!\deprecated -//! -//!\param[in] src Compressed array. -//!\param[in] srcLen Size in bytes of the compressed array. -//!\param[out] dst Decompressed array. 
-//!\param[in] dstLen Size in bytes of the decompressed array. -void decompress_memory_huffman(unsigned char const *const src, - const std::size_t srcLen, long int *const dst, - const std::size_t dstLen); - #ifdef MGARD_ZSTD //! Compress an array using `zstd`. //! diff --git a/include/huffman.hpp b/include/huffman.hpp index e86841df57..7233f6e6a7 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -22,6 +22,8 @@ namespace mgard { inline constexpr std::size_t nql = 1 << 17; //! A stream compressed using a Huffman code. +//! +//!\deprecated struct HuffmanEncodedStream { //! Constructor. //! diff --git a/src/compressors.cpp b/src/compressors.cpp index 35d16a52c2..a67279292f 100644 --- a/src/compressors.cpp +++ b/src/compressors.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -22,46 +23,6 @@ namespace mgard { -void decompress_memory_huffman(unsigned char const *const src, - const std::size_t srcLen, long int *const dst, - const std::size_t dstLen) { - // Dummy header until we change the signature of `decompress_memory_huffman`. - pb::Header header; - header.mutable_encoding()->set_compressor( -#ifdef MGARD_ZSTD - pb::Encoding::CPU_HUFFMAN_ZSTD -#else - pb::Encoding::CPU_HUFFMAN_ZLIB -#endif - ); - const HuffmanEncodedStream encoded = - decompress_deserialize(header, src, srcLen); - const MemoryBuffer decoded = huffman_decoding(encoded); - { - long int const *const p = decoded.data.get(); - if (decoded.size * sizeof(*p) != dstLen) { - throw std::runtime_error( - "mismatch between expected and obtained decompressed buffer sizes"); - } - std::copy(p, p + decoded.size, dst); - } -} - -MemoryBuffer compress_memory_huffman(long int const *const src, - const std::size_t srcLen) { - const HuffmanEncodedStream encoded = huffman_encoding(src, srcLen); - // Dummy header until we change the signature of `compress_memory_huffman`. 
- pb::Header header; - header.mutable_encoding()->set_compressor( -#ifdef MGARD_ZSTD - pb::Encoding::CPU_HUFFMAN_ZSTD -#else - pb::Encoding::CPU_HUFFMAN_ZLIB -#endif - ); - return serialize_compress(header, encoded); -} - #ifdef MGARD_ZSTD /*! CHECK * Check that the condition holds. If it doesn't print a message and die. @@ -76,10 +37,6 @@ MemoryBuffer compress_memory_huffman(long int const *const src, } \ } while (0) -/*! CHECK_ZSTD - * Check the zstd error code and die if an error occurred after printing a - * message. - */ /*! CHECK_ZSTD * Check the zstd error code and die if an error occurred after printing a * message. @@ -191,58 +148,388 @@ void decompress_memory_zstd(void const *const src, const std::size_t srcLen, } #endif +namespace { + +template +MemoryBuffer compress_huffman_C_rfmh_(const pb::Header &header, + void const *const src, + const std::size_t srcLen) { + check_quantization_buffer(header, src, srcLen); + + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + return huffman_encode(static_cast(src), srcLen / sizeof(Int)); +} + +// `C` being either ZSTD or `zlib`. 
+MemoryBuffer compress_huffman_C_rfmh(const pb::Header &header, + void *const src, + const std::size_t srcLen) { + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + switch (header.quantization().type()) { + case pb::Quantization::INT8_T: + return compress_huffman_C_rfmh_(header, src, srcLen); + case pb::Quantization::INT16_T: + return compress_huffman_C_rfmh_(header, src, srcLen); + case pb::Quantization::INT32_T: + return compress_huffman_C_rfmh_(header, src, srcLen); + case pb::Quantization::INT64_T: + return compress_huffman_C_rfmh_(header, src, srcLen); + default: + throw std::runtime_error("unrecognized quantization type"); + } +} + +MemoryBuffer +compress_huffman_C_deprecated(const pb::Header &header, void *const src, + const std::size_t srcLen) { + check_quantization_buffer(header, src, srcLen); + + assert(header.encoding().serialization() == pb::Encoding::DEPRECATED); + if (header.quantization().type() != mgard::pb::Quantization::INT64_T) { + throw std::runtime_error( + "deprecated Huffman coding not implemented for quantization " + "types other than `std::int64_t`"); + } + // I don't think it's strictly necessary that `std::int64_t` and `long int` + // are the same type. We could think of `long int` as a generic byte type, + // like `unsigned char`. Worth more attention if this assertion ever fails, + // though. That might be a good time to remove the deprecated Huffman coding + // functions. 
+ static_assert(std::is_same::value, + "deprecated Huffman coding written with assumption that " + "`std::int64_t` is `long int`"); + + return serialize_compress( + header, huffman_encoding(reinterpret_cast(src), + srcLen / sizeof(long int))); +} + +MemoryBuffer +compress_huffman_zlib_deprecated(const pb::Header &header, void *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + + return compress_huffman_C_deprecated(header, src, srcLen); +} + +#ifdef MGARD_ZSTD +MemoryBuffer +compress_huffman_zstd_deprecated(const pb::Header &header, void *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + + return compress_huffman_C_deprecated(header, src, srcLen); +} +#endif + +namespace { + +// `decompress_memory_z` and `decompress_memory_zstd` need to know the size of +// the decompressed buffer before they can decompress. So, in addition to the +// compressed serialized Huffman tree (`compressed`), we need to store the size +// in bytes of the serialized Huffman tree (`nhuffman`). +MemoryBuffer concatenate_nhuffman_and_compressed( + const std::size_t nhuffman, const MemoryBuffer &compressed) { + MemoryBuffer out(HEADER_SIZE_SIZE + compressed.size); + unsigned char *p = out.data.get(); + + // Size in bytes of the serialized Huffman tree. 
+ const std::array nhuffman_ = + serialize_header_size(nhuffman); + std::copy(nhuffman_.begin(), nhuffman_.end(), p); + p += HEADER_SIZE_SIZE; + + unsigned char const *const q = compressed.data.get(); + std::copy(q, q + compressed.size, p); + return out; +} + +} // namespace + +MemoryBuffer +compress_huffman_zlib_rfmh(const pb::Header &header, void *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + const MemoryBuffer encoded = + compress_huffman_C_rfmh(header, src, srcLen); + const MemoryBuffer compressed = + compress_memory_z(encoded.data.get(), encoded.size); + return concatenate_nhuffman_and_compressed(encoded.size, compressed); +} + +#ifdef MGARD_ZSTD +MemoryBuffer +compress_huffman_zstd_rfmh(const pb::Header &header, void *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + const MemoryBuffer encoded = + compress_huffman_C_rfmh(header, src, srcLen); + return concatenate_nhuffman_and_compressed( + encoded.size, compress_memory_zstd(encoded.data.get(), encoded.size)); +} +#endif + +MemoryBuffer compress_huffman_zlib(const pb::Header &header, + void *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + + switch (header.encoding().serialization()) { + case pb::Encoding::DEPRECATED: + return compress_huffman_zlib_deprecated(header, src, srcLen); + case pb::Encoding::RFMH: + return compress_huffman_zlib_rfmh(header, src, srcLen); + default: + throw std::runtime_error("unrecognized Huffman serialization"); + } +} + +#ifdef MGARD_ZSTD +MemoryBuffer compress_huffman_zstd(const pb::Header &header, + void *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + + switch 
(header.encoding().serialization()) { + case pb::Encoding::DEPRECATED: + return compress_huffman_zstd_deprecated(header, src, srcLen); + case pb::Encoding::RFMH: + return compress_huffman_zstd_rfmh(header, src, srcLen); + default: + throw std::runtime_error("unrecognized Huffman serialization"); + } +} +#endif + +} // namespace + MemoryBuffer compress(const pb::Header &header, void *const src, const std::size_t srcLen) { switch (header.encoding().compressor()) { - case pb::Encoding::CPU_HUFFMAN_ZSTD: + case pb::Encoding::CPU_ZLIB: + return compress_memory_z(src, srcLen); + case pb::Encoding::CPU_ZSTD: #ifdef MGARD_ZSTD - { - if (header.quantization().type() != mgard::pb::Quantization::INT64_T) { - throw std::runtime_error("Huffman tree not implemented for quantization " - "types other than `std::int64_t`"); - } - // Quantization type size. - const std::size_t qts = quantization_buffer(header, 1).size; - if (srcLen % qts) { - throw std::runtime_error("incorrect quantization buffer size"); - } - return compress_memory_huffman(reinterpret_cast(src), - srcLen / qts); - } + return compress_memory_zstd(src, srcLen); #else throw std::runtime_error("MGARD compiled without ZSTD support"); #endif case pb::Encoding::CPU_HUFFMAN_ZLIB: - return compress_memory_z(src, srcLen); + return compress_huffman_zlib(header, src, srcLen); + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + return compress_huffman_zstd(header, src, srcLen); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif default: throw std::runtime_error("unrecognized lossless compressor"); } } +void decompress_noop(void *const src, const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + if (srcLen != dstLen) { + throw std::invalid_argument("source and destination lengths must be equal"); + } + { + unsigned char const *const p = static_cast(src); + unsigned char *const q = static_cast(dst); + std::copy(p, p + srcLen, q); + } +} + +namespace { + +template +void 
decompress_huffman_C_rfmh_(const pb::Header &header, + const MemoryBuffer &encoded, + void *const dst, const std::size_t dstLen) { + check_quantization_buffer(header, dst, dstLen); + + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + const MemoryBuffer decoded = huffman_decode(encoded); + if (sizeof(Int) * decoded.size != dstLen) { + throw std::runtime_error("size of destination buffer is incorrect"); + } + unsigned char const *const p = + reinterpret_cast(decoded.data.get()); + std::copy(p, p + dstLen, static_cast(dst)); +} + +// `C` being either ZSTD or `zlib`. +void decompress_huffman_C_rfmh(const pb::Header &header, + const MemoryBuffer &encoded, + void *const dst, const std::size_t dstLen) { + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + switch (header.quantization().type()) { + case pb::Quantization::INT8_T: + return decompress_huffman_C_rfmh_(header, encoded, dst, + dstLen); + case pb::Quantization::INT16_T: + return decompress_huffman_C_rfmh_(header, encoded, dst, + dstLen); + case pb::Quantization::INT32_T: + return decompress_huffman_C_rfmh_(header, encoded, dst, + dstLen); + case pb::Quantization::INT64_T: + return decompress_huffman_C_rfmh_(header, encoded, dst, + dstLen); + default: + throw std::runtime_error("unrecognized quantization type"); + } +} + +void decompress_huffman_C_deprecated(const pb::Header &header, void *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + check_quantization_buffer(header, dst, dstLen); + + assert(header.encoding().serialization() == pb::Encoding::DEPRECATED); + if (header.quantization().type() != mgard::pb::Quantization::INT64_T) { + throw std::runtime_error( + "deprecated Huffman coding not implemented for quantization " + "types other than `std::int64_t`"); + } + // I don't think it's strictly necessary that `std::int64_t` and `long int` + // are the same type. 
We could think of `long int` as a generic byte type, + // like `unsigned char`. Worth more attention if this assertion ever fails, + // though. That might be a good time to remove the deprecated Huffman coding + // functions. + static_assert(std::is_same::value, + "deprecated Huffman coding written with assumption that " + "`std::int64_t` is `long int`"); + + const MemoryBuffer decoded = + huffman_decoding(decompress_deserialize( + header, reinterpret_cast(src), srcLen)); + if (sizeof(long int) * decoded.size != dstLen) { + throw std::runtime_error("size of destination buffer is incorrect"); + } + { + unsigned char const *const p = + reinterpret_cast(decoded.data.get()); + std::copy(p, p + dstLen, static_cast(dst)); + } +} + +void decompress_huffman_zlib_deprecated(const pb::Header &header, + void *const src, + const std::size_t srcLen, + void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + + return decompress_huffman_C_deprecated(header, src, srcLen, dst, dstLen); +} + +#ifdef MGARD_ZSTD +void decompress_huffman_zstd_deprecated(const pb::Header &header, + void *const src, + const std::size_t srcLen, + void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + + return decompress_huffman_C_deprecated(header, src, srcLen, dst, dstLen); +} +#endif + +void decompress_huffman_zlib_rfmh(const pb::Header &header, void *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + BufferWindow window(src, srcLen); + // Read the size in bytes of the serialized Huffman tree. 
+ MemoryBuffer encoded(read_header_size(window)); + decompress_memory_z(const_cast(window.current), + window.end - window.current, encoded.data.get(), + encoded.size); + + return decompress_huffman_C_rfmh(header, encoded, dst, dstLen); +} + +#ifdef MGARD_ZSTD +void decompress_huffman_zstd_rfmh(const pb::Header &header, void *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + BufferWindow window(src, srcLen); + // Read the size in bytes of the serialized Huffman tree. + MemoryBuffer encoded(read_header_size(window)); + decompress_memory_zstd(const_cast(window.current), + window.end - window.current, encoded.data.get(), + encoded.size); + + return decompress_huffman_C_rfmh(header, encoded, dst, dstLen); +} +#endif + +void decompress_huffman_zlib(const pb::Header &header, void *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + + switch (header.encoding().serialization()) { + case pb::Encoding::DEPRECATED: + return decompress_huffman_zlib_deprecated(header, src, srcLen, dst, dstLen); + case pb::Encoding::RFMH: + return decompress_huffman_zlib_rfmh(header, src, srcLen, dst, dstLen); + default: + throw std::runtime_error("unrecognized Huffman serialization"); + } +} + +#ifdef MGARD_ZSTD +void decompress_huffman_zstd(const pb::Header &header, void *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + + switch (header.encoding().serialization()) { + case pb::Encoding::DEPRECATED: + return decompress_huffman_zstd_deprecated(header, src, srcLen, dst, dstLen); + case pb::Encoding::RFMH: + return decompress_huffman_zstd_rfmh(header, src, srcLen, dst, dstLen); + default: + throw 
std::runtime_error("unrecognized Huffman serialization"); + } +} +#endif + +} // namespace + void decompress(const pb::Header &header, void *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen) { - switch (read_encoding_compressor(header)) { - case pb::Encoding::NOOP_COMPRESSOR: - if (srcLen != dstLen) { - throw std::invalid_argument( - "source and destination lengths must be equal"); - } - { - unsigned char const *const p = static_cast(src); - unsigned char *const q = static_cast(dst); - std::copy(p, p + srcLen, q); - } - break; + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_ZLIB: + return decompress_memory_z(const_cast(src), srcLen, + static_cast(dst), dstLen); + case pb::Encoding::CPU_ZSTD: +#ifdef MGARD_ZSTD + return decompress_memory_zstd( + src, srcLen, reinterpret_cast(dst), dstLen); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif case pb::Encoding::CPU_HUFFMAN_ZLIB: - decompress_memory_z(const_cast(src), srcLen, - static_cast(dst), dstLen); - break; + return decompress_huffman_zlib(header, src, srcLen, dst, dstLen); case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD - decompress_memory_huffman(static_cast(src), srcLen, - static_cast(dst), dstLen); - break; + return decompress_huffman_zstd(header, src, srcLen, dst, dstLen); #else throw std::runtime_error("MGARD compiled without ZSTD support"); #endif diff --git a/src/format.cpp b/src/format.cpp index 83b138db81..e9cda8e756 100644 --- a/src/format.cpp +++ b/src/format.cpp @@ -161,6 +161,7 @@ void populate_defaults(pb::Header &header) { pb::Encoding::CPU_HUFFMAN_ZLIB #endif ); + e.set_serialization(pb::Encoding::RFMH); } { pb::Device &device = *header.mutable_device(); diff --git a/src/mgard.proto b/src/mgard.proto index a96fd67c4f..2407a78f7e 100644 --- a/src/mgard.proto +++ b/src/mgard.proto @@ -125,7 +125,11 @@ message Encoding { } enum Compressor { NOOP_COMPRESSOR = 0; - CPU_HUFFMAN_ZLIB = 1; + // Explanation for the 
wonky numbering: this first case was originally called `CPU_HUFFMAN_ZLIB`, + // but the relevant code didn't actually call the Huffman encoder. + CPU_ZLIB = 1; + CPU_ZSTD = 7; + CPU_HUFFMAN_ZLIB = 6; CPU_HUFFMAN_ZSTD = 2; X_HUFFMAN = 3; X_HUFFMAN_LZ4 = 4; From d0fe35f0e383533ffd15bf07384674ce615395ae Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 16 Jun 2022 17:53:36 -0400 Subject: [PATCH 45/58] Add tests for `RFMH` in `{,de}compress`. --- tests/include/compressors_regression.hpp | 9 +- tests/src/compressors_regression.cpp | 52 ++- tests/src/test_compressors.cpp | 398 +++++++++++++---------- 3 files changed, 276 insertions(+), 183 deletions(-) diff --git a/tests/include/compressors_regression.hpp b/tests/include/compressors_regression.hpp index 07f632eec4..a1adfe0ee7 100644 --- a/tests/include/compressors_regression.hpp +++ b/tests/include/compressors_regression.hpp @@ -5,6 +5,7 @@ #include +#include "format.hpp" #include "utilities.hpp" namespace mgard { @@ -13,18 +14,22 @@ namespace regression { //! Compress an array using a Huffman tree. //! +//!\param[in] header Header for the self-describing buffer. //!\param[in] src Array to be compressed. //!\param[in] srcLen Size of array (number of elements) to be compressed. -MemoryBuffer compress_memory_huffman(long int const *const src, +MemoryBuffer compress_memory_huffman(const pb::Header &header, + long int const *const src, const std::size_t srcLen); //! Decompress an array compressed with `compress_memory_huffman`. //! +//!\param[in] header Header parsed from the original self-describing buffer. //!\param[in] src Compressed array. //!\param[in] srcLen Size in bytes of the compressed array. //!\param[out] dst Decompressed array. //!\param[in] dstLen Size in bytes of the decompressed array. 
-void decompress_memory_huffman(unsigned char const *const src, +void decompress_memory_huffman(const pb::Header &header, + unsigned char const *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen); diff --git a/tests/src/compressors_regression.cpp b/tests/src/compressors_regression.cpp index 4284f31b31..a34a192004 100644 --- a/tests/src/compressors_regression.cpp +++ b/tests/src/compressors_regression.cpp @@ -26,11 +26,31 @@ std::size_t hit_buffer_size(const std::size_t nbits) { return nbits / CHAR_BIT + sizeof(unsigned int); } +MemoryBuffer compress_serialized(const pb::Header &header, + unsigned char const *const p, + const std::size_t n) { + assert(header.encoding().serialization() == pb::Encoding::DEPRECATED); + + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_HUFFMAN_ZLIB: + return compress_memory_z(const_cast(p), n); + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + return compress_memory_zstd(p, n); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + default: + throw std::runtime_error("unrecognized lossless compressor"); + } +} + } // namespace // This code also makes endianness assumptions. 
-MemoryBuffer compress_memory_huffman(long int const *const src, +MemoryBuffer compress_memory_huffman(const pb::Header &header, + long int const *const src, const std::size_t srcLen) { HuffmanEncodedStream encoded = mgard::regression::huffman_encoding(src, srcLen); @@ -65,13 +85,9 @@ MemoryBuffer compress_memory_huffman(long int const *const src, std::memcpy(bufp, encoded.missed.data.get(), encoded.missed.size); bufp += encoded.missed.size; -#ifndef MGARD_ZSTD const MemoryBuffer out_data = - compress_memory_z(payload, npayload); -#else - const MemoryBuffer out_data = - compress_memory_zstd(payload, npayload); -#endif + compress_serialized(header, payload, npayload); + delete[] payload; bufp = nullptr; @@ -95,9 +111,12 @@ MemoryBuffer compress_memory_huffman(long int const *const src, return MemoryBuffer(buffer, bufferLen); } -void decompress_memory_huffman(unsigned char const *const src, +void decompress_memory_huffman(const pb::Header &header, + unsigned char const *const src, const std::size_t srcLen, long int *const dst, const std::size_t dstLen) { + assert(header.encoding().serialization() == pb::Encoding::DEPRECATED); + std::size_t const *const sizes = reinterpret_cast(src); const std::size_t nfrequencies = sizes[0]; const std::size_t nbits = sizes[1]; @@ -112,12 +131,21 @@ void decompress_memory_huffman(unsigned char const *const src, unsigned char *const dst_ = buffer.data.get(); const std::size_t dstLen_ = buffer.size; -#ifndef MGARD_ZSTD - decompress_memory_z(const_cast(src_), srcLen_, - dst_, dstLen_); + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_HUFFMAN_ZLIB: + decompress_memory_z(const_cast(src_), srcLen_, + dst_, dstLen_); + break; + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); + break; #else - decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); + throw std::runtime_error("MGARD compiled without ZSTD support"); #endif + default: + throw 
std::runtime_error("unrecognized lossless compressor"); + } } HuffmanEncodedStream encoded(nbits, nmissed, nfrequencies); diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp index 3f5568dda7..9b7f28cc09 100644 --- a/tests/src/test_compressors.cpp +++ b/tests/src/test_compressors.cpp @@ -1,3 +1,4 @@ +#include "catch2/catch_template_test_macros.hpp" #include "catch2/catch_test_macros.hpp" #include @@ -14,57 +15,72 @@ namespace { -template -void test_huffman_identity(std::default_random_engine &gen, - const std::size_t n) { - std::uniform_int_distribution dis(std::numeric_limits::min()); - const auto f = [&]() -> T { return dis(gen); }; - std::vector src(n); - std::generate(src.begin(), src.end(), f); - std::vector src_(src); - mgard::MemoryBuffer compressed = - mgard::compress_memory_huffman(src_.data(), n); - long int *const decompressed = new long int[n]; - mgard::decompress_memory_huffman(compressed.data.get(), compressed.size, - decompressed, n * sizeof(long int)); - REQUIRE(std::equal(src.begin(), src.end(), decompressed)); - delete[] decompressed; +// Generate a header for use with the deprecated Huffman serialization method. 
+mgard::pb::Header +deprecated_header(const mgard::pb::Encoding::Compressor compressor) { + mgard::pb::Header header; + header.mutable_quantization()->set_type(mgard::pb::Quantization::INT64_T); + header.mutable_encoding()->set_preprocessor(mgard::pb::Encoding::SHUFFLE); + header.mutable_encoding()->set_compressor(compressor); + header.mutable_encoding()->set_serialization(mgard::pb::Encoding::DEPRECATED); + return header; } void test_huffman_compression_regression(long int const *const src, const std::size_t srcLen) { - const mgard::MemoryBuffer out = - mgard::regression::compress_memory_huffman(src, srcLen); - const mgard::MemoryBuffer out_ = - mgard::compress_memory_huffman(src, srcLen); - - REQUIRE(out.size == out_.size); - unsigned char const *const p = out.data.get(); - unsigned char const *const p_ = out_.data.get(); - REQUIRE(std::equal(p, p + out.size, p_)); + std::vector compressors; + compressors.push_back(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); +#ifdef MGARD_ZSTD + compressors.push_back(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); +#endif + + for (mgard::pb::Encoding::Compressor compressor : compressors) { + const mgard::pb::Header header = deprecated_header(compressor); + const mgard::MemoryBuffer out = + mgard::regression::compress_memory_huffman(header, src, srcLen); + unsigned char const *const p = out.data.get(); + + const mgard::MemoryBuffer out_ = mgard::compress( + header, const_cast(src), srcLen * sizeof(long int)); + unsigned char const *const p_ = out_.data.get(); + + REQUIRE(out.size == out_.size); + REQUIRE(std::equal(p, p + out.size, p_)); + } } void test_huffman_decompression_regression(long int const *const src, const std::size_t srcLen) { - const mgard::MemoryBuffer compressed = - mgard::regression::compress_memory_huffman(src, srcLen); - const mgard::MemoryBuffer compressed_ = - mgard::regression::compress_memory_huffman(src, srcLen); + std::vector compressors; + compressors.push_back(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); +#ifdef 
MGARD_ZSTD + compressors.push_back(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); +#endif + + for (const mgard::pb::Encoding::Compressor compressor : compressors) { + const mgard::pb::Header header = deprecated_header(compressor); + + const mgard::MemoryBuffer compressed = + mgard::regression::compress_memory_huffman(header, src, srcLen); + const mgard::MemoryBuffer compressed_(compressed.size); - mgard::MemoryBuffer out(srcLen); - mgard::MemoryBuffer out_(srcLen); + unsigned char *const q = compressed.data.get(); + unsigned char *const q_ = compressed_.data.get(); + std::copy(q, q + compressed.size, q_); - unsigned char *const q = compressed.data.get(); - unsigned char *const q_ = compressed_.data.get(); - long int *const p = out.data.get(); - long int *const p_ = out_.data.get(); + mgard::MemoryBuffer out(srcLen); + mgard::MemoryBuffer out_(srcLen); - mgard::regression::decompress_memory_huffman(q, compressed.size, p, - out.size * sizeof(long int)); - mgard::decompress_memory_huffman(q_, compressed_.size, p_, - out_.size * sizeof(long int)); + long int *const p = out.data.get(); + long int *const p_ = out_.data.get(); - REQUIRE(std::equal(p, p + srcLen, p_)); + mgard::regression::decompress_memory_huffman(header, q, compressed.size, p, + out.size * sizeof(long int)); + + mgard::decompress(header, q_, compressed_.size, out_.data.get(), + out_.size * sizeof(long int)); + REQUIRE(std::equal(p, p + srcLen, p_)); + } } void test_hcr_constant(const std::size_t srcLen, const long int q) { @@ -163,15 +179,6 @@ TEST_CASE("Huffman decompression regression", "[compressors] [regression]") { } } -TEST_CASE("Huffman compression", "[compressors] [!mayfail]") { - std::default_random_engine gen(257100); - const std::size_t n = 5000; - SECTION("signed characters") { test_huffman_identity(gen, n); } - SECTION("short integers") { test_huffman_identity(gen, n); } - SECTION("integers") { test_huffman_identity(gen, n); } - SECTION("long integers") { test_huffman_identity(gen, n); } -} - 
#ifdef MGARD_ZSTD namespace { @@ -232,150 +239,203 @@ TEST_CASE("zlib compression", "[compressors]") { } } -TEST_CASE("compression with header configuration", "[compressors]") { - mgard::pb::Header header; - // TODO: Once Huffman trees can be built for types other than `long int`, use - // something other than `std::int64_t` here. - mgard::populate_defaults(header); +namespace { - const std::size_t ndof = 10000; - std::int64_t *const quantized = new std::int64_t[ndof]; - std::uniform_int_distribution dis(-250, 250); - std::default_random_engine gen(419643); - const auto f = [&]() -> std::int64_t { return dis(gen); }; - std::generate(quantized, quantized + ndof, f); - const std::size_t quantizedLen = ndof * sizeof(*quantized); - // `dst` must have the correct alignment for the quantization type. - std::int64_t *const dst = new std::int64_t[ndof]; - - std::int64_t *const quantized_ = new std::int64_t[ndof]; - std::copy(quantized, quantized + ndof, quantized_); +template +void test_cd_inversion(const mgard::pb::Header &header, + Int const *const quantized, const std::size_t n) { + const std::size_t nbytes = sizeof(Int) * n; + + Int *const quantized_ = new Int[n]; + std::copy(quantized, quantized + n, quantized_); const mgard::MemoryBuffer compressed = - mgard::compress(header, quantized_, quantizedLen); + mgard::compress(header, quantized_, nbytes); delete[] quantized_; - const mgard::pb::Encoding &e = header.encoding(); - REQUIRE(e.preprocessor() == mgard::pb::Encoding::SHUFFLE); -#ifdef MGARD_ZSTD - REQUIRE(e.compressor() == mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); - mgard::regression::decompress_memory_huffman( - compressed.data.get(), compressed.size, dst, quantizedLen); -#else - REQUIRE(e.compressor() == mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); - mgard::decompress_memory_z(compressed.data.get(), compressed.size, - reinterpret_cast(dst), - quantizedLen); -#endif - REQUIRE(std::equal(quantized, quantized + ndof, dst)); - delete[] dst; + Int *const decompressed = new 
Int[n]; + mgard::decompress(header, compressed.data.get(), compressed.size, + decompressed, nbytes); + REQUIRE(std::equal(quantized, quantized + n, decompressed)); + delete[] decompressed; +} + +template +void test_cd_inversion_constant(const mgard::pb::Header &header, + const std::size_t N, const Int q) { + Int *const quantized = new Int[N]; + std::fill(quantized, quantized + N, q); + test_cd_inversion(header, quantized, N); delete[] quantized; } -TEST_CASE("decompression with header configuration", "[compressors]") { - mgard::pb::Header header; - // TODO: Once Huffman trees can be built for types other than `long int`, use - // something other than `std::int64_t` here. - mgard::populate_defaults(header); +template +void test_cd_inversion_periodic(const mgard::pb::Header &header, + const std::size_t N, const Int q, + const std::size_t period) { + Int *const quantized = new Int[N]; + std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); + test_cd_inversion(header, quantized, N); + delete[] quantized; +} + +template +void test_cd_inversion_random(const mgard::pb::Header &header, + const std::size_t N, const Int a, const Int b, + std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + Int *const quantized = new Int[N]; + std::generate(quantized, quantized + N, [&] { return dis(gen); }); + test_cd_inversion(header, quantized, N); + delete[] quantized; +} + +template +mgard::pb::Quantization::Type type_to_quantization_type(); - const std::size_t ndof = 5000; - std::int64_t *const quantized = new std::int64_t[ndof]; - std::uniform_int_distribution dis(-500, 500); - std::default_random_engine gen(489063); - const auto f = [&]() -> std::int64_t { return dis(gen); }; - std::generate(quantized, quantized + ndof, f); - const std::size_t quantizedLen = ndof * sizeof(*quantized); - // `dst` must have the correct alignment for the quantization type. 
- std::int64_t *const dst = new std::int64_t[ndof]; +template <> +mgard::pb::Quantization::Type type_to_quantization_type() { + return mgard::pb::Quantization::INT8_T; +} + +template <> +mgard::pb::Quantization::Type type_to_quantization_type() { + return mgard::pb::Quantization::INT16_T; +} +template <> +mgard::pb::Quantization::Type type_to_quantization_type() { + return mgard::pb::Quantization::INT32_T; +} + +template <> +mgard::pb::Quantization::Type type_to_quantization_type() { + return mgard::pb::Quantization::INT64_T; +} + +template +void test_cd_inversion_constant(const mgard::pb::Header &header) { + test_cd_inversion_constant(header, 100, 98); + test_cd_inversion_constant(header, 1000, 0); + test_cd_inversion_constant(header, 10000, -62); +} + +template +void test_cd_inversion_periodic(const mgard::pb::Header &header) { + test_cd_inversion_periodic(header, 100, -5, 3); + test_cd_inversion_periodic(header, 1000, 86, 60); + test_cd_inversion_periodic(header, 10000, 7, 62); +} + +template +void test_cd_inversion_random(const mgard::pb::Header &header) { + std::default_random_engine gen(894584); + test_cd_inversion_random(header, 100, 0, 3, gen); + test_cd_inversion_random(header, 1000, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_cd_inversion_random(header, 10000, -110, 110, gen); +} + +template <> +void test_cd_inversion_random(const mgard::pb::Header &header) { + std::default_random_engine gen(952426); + test_cd_inversion_random(header, 100, -1, 1, gen); + // In the deprecated Huffman encoding function, the missed symbols are cast + // from `long int` to `int`. 
+ test_cd_inversion_random(header, 1000, + std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_cd_inversion_random(header, 10000, 0, 250, gen); +} + +template +void test_cd_inversion(const mgard::pb::Header &header) { + SECTION("constant data") { test_cd_inversion_constant(header); } + SECTION("periodic data") { test_cd_inversion_periodic(header); } + SECTION("random data") { test_cd_inversion_random(header); } +} + +} // namespace + +TEMPLATE_TEST_CASE("`compress`/`decompress` inversion", "[compressors]", + std::int8_t, std::int16_t, std::int32_t, std::int64_t) { + mgard::pb::Header header; + mgard::populate_defaults(header); + mgard::pb::Quantization &q = *header.mutable_quantization(); mgard::pb::Encoding &e = *header.mutable_encoding(); - SECTION("noop") { - e.set_compressor(mgard::pb::Encoding::NOOP_COMPRESSOR); - - const std::size_t srcLen = quantizedLen; - unsigned char *const src = new unsigned char[srcLen]; - { - unsigned char const *const p = - reinterpret_cast(quantized); - std::copy(p, p + quantizedLen, src); - } - mgard::decompress(header, src, srcLen, - reinterpret_cast(dst), quantizedLen); - delete[] src; - REQUIRE(std::equal(quantized, quantized + ndof, dst)); + const mgard::pb::Quantization::Type qtype = + type_to_quantization_type(); + q.set_type(qtype); + + SECTION("`CPU_ZLIB`") { + e.set_compressor(mgard::pb::Encoding::CPU_ZLIB); + test_cd_inversion(header); } - SECTION("zlib") { - e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); +#ifdef MGARD_ZSTD + SECTION("`CPU_ZSTD`") { + e.set_compressor(mgard::pb::Encoding::CPU_ZSTD); + test_cd_inversion(header); + } +#endif - const mgard::MemoryBuffer out = - mgard::compress_memory_z(quantized, quantizedLen); + // The deprecated Huffman serialization method requires the quantization type + // to be `std::int64_t`. 
+ if (qtype == mgard::pb::Quantization::INT64_T) { + SECTION("`CPU_HUFFMAN_ZLIB` with `DEPRECATED`") { + e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); + e.set_serialization(mgard::pb::Encoding::DEPRECATED); + test_cd_inversion(header); + } - const std::size_t srcLen = out.size * sizeof(*out.data.get()); - unsigned char *const src = new unsigned char[srcLen]; - { - unsigned char const *const p = out.data.get(); - std::copy(p, p + srcLen, src); +#ifdef MGARD_ZSTD + SECTION("`CPU_HUFFMAN_ZSTD` with `DEPRECATED`") { + e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); + e.set_serialization(mgard::pb::Encoding::DEPRECATED); + test_cd_inversion(header); } - mgard::decompress(header, src, srcLen, - reinterpret_cast(dst), quantizedLen); - delete[] src; - REQUIRE(std::equal(quantized, quantized + ndof, dst)); +#endif + } + + SECTION("`CPU_HUFFMAN_ZLIB` with `RFMH`") { + e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); + e.set_serialization(mgard::pb::Encoding::RFMH); + test_cd_inversion(header); } #ifdef MGARD_ZSTD - SECTION("zstd") { + SECTION("`CPU_HUFFMAN_ZSTD` with `RFMH`") { e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); - - std::int64_t *const quantized_ = new std::int64_t[ndof]; - std::copy(quantized, quantized + ndof, quantized_); - const mgard::MemoryBuffer out = - mgard::regression::compress_memory_huffman(quantized_, ndof); - delete[] quantized_; - - const std::size_t srcLen = out.size; - unsigned char *const src = new unsigned char[srcLen]; - { - unsigned char const *const p = out.data.get(); - std::copy(p, p + srcLen, src); - } - mgard::decompress(header, src, srcLen, - reinterpret_cast(dst), quantizedLen); - delete[] src; - REQUIRE(std::equal(quantized, quantized + ndof, dst)); + e.set_serialization(mgard::pb::Encoding::RFMH); + test_cd_inversion(header); } #endif - - delete[] dst; - delete[] quantized; } -TEST_CASE("compression and decompression with header", "[compressors]") { - mgard::pb::Header header; - // TODO: Once 
Huffman trees can be built for types other than `long int`, use - // something other than `std::int64_t` here. - mgard::populate_defaults(header); - - const std::size_t ndof = 2500; - std::int64_t *const quantized = new std::int64_t[ndof]; - std::uniform_int_distribution dis(-1000, 1000); - std::default_random_engine gen(995719); - const auto f = [&]() -> std::int64_t { return dis(gen); }; - std::generate(quantized, quantized + ndof, f); - const std::size_t quantizedLen = ndof * sizeof(*quantized); - // `dst` must have the correct alignment for the quantization type. - std::int64_t *const dst = new std::int64_t[ndof]; - - std::int64_t *const quantized_ = new std::int64_t[ndof]; - std::copy(quantized, quantized + ndof, quantized_); - const mgard::MemoryBuffer compressed = - mgard::compress(header, quantized_, quantizedLen); - delete[] quantized_; - - mgard::decompress(header, compressed.data.get(), compressed.size, dst, - quantizedLen); +// In the deprecated Huffman encoding function, the missed symbols are cast from +// `long int` to `int`. +TEST_CASE("deprecated Huffman inversion", "[compressors] [!shouldfail]") { + std::default_random_engine gen(257100); + const std::int64_t a = + 2 * static_cast(std::numeric_limits::min()); + const std::int64_t b = + 2 * static_cast(std::numeric_limits::max()); + + SECTION("`CPU_HUFFMAN_ZLIB` with `DEPRECATED`") { + // Conceivably this could pass if all the generated `std::int64_t`s are + // representable as `int`s. + test_cd_inversion_random( + deprecated_header(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB), 5000, a, b, + gen); + } - REQUIRE(std::equal(quantized, quantized + ndof, dst)); - delete[] dst; - delete[] quantized; +#ifdef MGARD_ZSTD + SECTION("`CPU_HUFFMAN_ZSTD` with `DEPRECATED`") { + // Conceivably this could pass if all the generated `std::int64_t`s are + // representable as `int`s. 
+ test_cd_inversion_random( + deprecated_header(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD), 5000, a, b, + gen); + } +#endif } From 309823d74dac5a3721967eb65fb41590ae7aa627 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Mon, 20 Jun 2022 11:11:14 -0400 Subject: [PATCH 46/58] Rename `compressors.hpp` to `lossless.hpp`. --- CMakeLists.txt | 2 +- include/compress.tpp | 2 +- include/compress_internal.tpp | 2 +- include/{compressors.hpp => lossless.hpp} | 4 ++-- src/cuda/LosslessCompression.cu | 2 +- src/huffman.cpp | 2 +- src/{compressors.cpp => lossless.cpp} | 2 +- tests/CMakeLists.txt | 4 ++-- .../{compressors_regression.hpp => lossless_regression.hpp} | 0 .../{compressors_regression.cpp => lossless_regression.cpp} | 5 ++--- tests/src/{test_compressors.cpp => test_lossless.cpp} | 4 ++-- 11 files changed, 14 insertions(+), 15 deletions(-) rename include/{compressors.hpp => lossless.hpp} (98%) rename src/{compressors.cpp => lossless.cpp} (99%) rename tests/include/{compressors_regression.hpp => lossless_regression.hpp} (100%) rename tests/src/{compressors_regression.cpp => lossless_regression.cpp} (98%) rename tests/src/{test_compressors.cpp => test_lossless.cpp} (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index c92cca399a..fe65fd743b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,7 +203,7 @@ set( src/compress_internal.cpp src/utilities.cpp src/huffman.cpp - src/compressors.cpp + src/lossless.cpp src/format.cpp ) diff --git a/include/compress.tpp b/include/compress.tpp index 867b6cfa9f..5d22b1c25a 100644 --- a/include/compress.tpp +++ b/include/compress.tpp @@ -20,9 +20,9 @@ #include "MGARDConfig.hpp" #include "TensorMultilevelCoefficientQuantizer.hpp" #include "TensorNorms.hpp" -#include "compressors.hpp" #include "decompose.hpp" #include "format.hpp" +#include "lossless.hpp" #include "quantize.hpp" #include "shuffle.hpp" diff --git a/include/compress_internal.tpp b/include/compress_internal.tpp index fd16dcd331..6aca8124af 100644 --- 
a/include/compress_internal.tpp +++ b/include/compress_internal.tpp @@ -1,8 +1,8 @@ #include #include "compress.hpp" -#include "compressors.hpp" #include "decompose.hpp" +#include "lossless.hpp" #include "quantize.hpp" #include "shuffle.hpp" diff --git a/include/compressors.hpp b/include/lossless.hpp similarity index 98% rename from include/compressors.hpp rename to include/lossless.hpp index c946538b50..d24cee9a0b 100644 --- a/include/compressors.hpp +++ b/include/lossless.hpp @@ -1,5 +1,5 @@ -#ifndef COMPRESSORS_HPP -#define COMPRESSORS_HPP +#ifndef LOSSLESS_HPP +#define LOSSLESS_HPP //!\file //!\brief Lossless compressors for quantized multilevel coefficients. diff --git a/src/cuda/LosslessCompression.cu b/src/cuda/LosslessCompression.cu index feb61ab2d4..072d6be598 100644 --- a/src/cuda/LosslessCompression.cu +++ b/src/cuda/LosslessCompression.cu @@ -5,7 +5,7 @@ * Date: September 27, 2021 */ -// #include "compressors.hpp" +// #include "lossless.hpp" #include "cuda/Common.h" #include "cuda/CommonInternal.h" #include "cuda/LosslessCompression.h" diff --git a/src/huffman.cpp b/src/huffman.cpp index 1ebb2fbded..4ffd4a05af 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -11,8 +11,8 @@ #include #include -#include "compressors.hpp" #include "huffman.hpp" +#include "lossless.hpp" namespace mgard { diff --git a/src/compressors.cpp b/src/lossless.cpp similarity index 99% rename from src/compressors.cpp rename to src/lossless.cpp index a67279292f..3244e35eb2 100644 --- a/src/compressors.cpp +++ b/src/lossless.cpp @@ -1,4 +1,4 @@ -#include "compressors.hpp" +#include "lossless.hpp" #include #include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f625d0a148..80747cefca 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -18,8 +18,8 @@ set( "src/test_decompose.cpp" "src/test_format.cpp" "src/test_quantize.cpp" - "src/compressors_regression.cpp" - "src/test_compressors.cpp" + "src/lossless_regression.cpp" + "src/test_lossless.cpp" 
"src/test_CompressedDataset.cpp" "src/huffman_regression.cpp" "src/test_huffman.cpp" diff --git a/tests/include/compressors_regression.hpp b/tests/include/lossless_regression.hpp similarity index 100% rename from tests/include/compressors_regression.hpp rename to tests/include/lossless_regression.hpp diff --git a/tests/src/compressors_regression.cpp b/tests/src/lossless_regression.cpp similarity index 98% rename from tests/src/compressors_regression.cpp rename to tests/src/lossless_regression.cpp index a34a192004..e84725cfc3 100644 --- a/tests/src/compressors_regression.cpp +++ b/tests/src/lossless_regression.cpp @@ -1,12 +1,11 @@ -#include "compressors_regression.hpp" +#include "lossless_regression.hpp" #include #include -#include "compressors.hpp" -#include "compressors_regression.hpp" #include "huffman.hpp" #include "huffman_regression.hpp" +#include "lossless.hpp" namespace mgard { diff --git a/tests/src/test_compressors.cpp b/tests/src/test_lossless.cpp similarity index 99% rename from tests/src/test_compressors.cpp rename to tests/src/test_lossless.cpp index 9b7f28cc09..fa9198709f 100644 --- a/tests/src/test_compressors.cpp +++ b/tests/src/test_lossless.cpp @@ -7,9 +7,9 @@ #include #include -#include "compressors.hpp" -#include "compressors_regression.hpp" #include "format.hpp" +#include "lossless.hpp" +#include "lossless_regression.hpp" #include "testing_utilities.hpp" From 0e6c563d181c4a25d7c65f9436680bd85a97e8e6 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Mon, 20 Jun 2022 11:34:21 -0400 Subject: [PATCH 47/58] Separate lossless compressor implementations. 
--- CMakeLists.txt | 6 +- src/{lossless.cpp => lossless_dispatcher.cpp} | 131 ------------------ src/lossless_zlib.cpp | 88 ++++++++++++ src/lossless_zstd.cpp | 60 ++++++++ 4 files changed, 153 insertions(+), 132 deletions(-) rename src/{lossless.cpp => lossless_dispatcher.cpp} (77%) create mode 100644 src/lossless_zlib.cpp create mode 100644 src/lossless_zstd.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index fe65fd743b..03d1b09a68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,9 +203,13 @@ set( src/compress_internal.cpp src/utilities.cpp src/huffman.cpp - src/lossless.cpp + src/lossless_zlib.cpp + src/lossless_dispatcher.cpp src/format.cpp ) +if(zstd_FOUND) + list(APPEND MGARD_LIBRARY_CPP src/lossless_zstd.cpp) +endif() set(MAXIMUM_DIMENSION 4 CACHE STRING "Maximum supported dimension for self-describing decompression.") diff --git a/src/lossless.cpp b/src/lossless_dispatcher.cpp similarity index 77% rename from src/lossless.cpp rename to src/lossless_dispatcher.cpp index 3244e35eb2..97e565dc25 100644 --- a/src/lossless.cpp +++ b/src/lossless_dispatcher.cpp @@ -11,143 +11,12 @@ #include #include -#include - #include "format.hpp" #include "huffman.hpp" #include "utilities.hpp" -#ifdef MGARD_ZSTD -#include -#endif - namespace mgard { -#ifdef MGARD_ZSTD -/*! CHECK - * Check that the condition holds. If it doesn't print a message and die. - */ -#define CHECK(cond, ...) \ - do { \ - if (!(cond)) { \ - fprintf(stderr, "%s:%d CHECK(%s) failed: ", __FILE__, __LINE__, #cond); \ - fprintf(stderr, "" __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - exit(1); \ - } \ - } while (0) - -/*! CHECK_ZSTD - * Check the zstd error code and die if an error occurred after printing a - * message. - */ -#define CHECK_ZSTD(fn, ...) 
\ - do { \ - size_t const err = (fn); \ - CHECK(!ZSTD_isError(err), "%s", ZSTD_getErrorName(err)); \ - } while (0) - -MemoryBuffer compress_memory_zstd(void const *const src, - const std::size_t srcLen) { - const size_t cBuffSize = ZSTD_compressBound(srcLen); - unsigned char *const buffer = new unsigned char[cBuffSize]; - const std::size_t cSize = ZSTD_compress(buffer, cBuffSize, src, srcLen, 1); - CHECK_ZSTD(cSize); - return MemoryBuffer(buffer, cSize); -} -#endif - -MemoryBuffer compress_memory_z(void z_const *const src, - const std::size_t srcLen) { - const std::size_t BUFSIZE = 2048 * 1024; - std::vector buffers; - std::vector bufferLengths; - - z_stream strm; - strm.zalloc = Z_NULL; - strm.zfree = Z_NULL; - strm.next_in = static_cast(src); - strm.avail_in = srcLen; - buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); - bufferLengths.push_back(strm.avail_out = BUFSIZE); - - deflateInit(&strm, Z_BEST_COMPRESSION); - - while (strm.avail_in != 0) { - [[maybe_unused]] const int res = deflate(&strm, Z_NO_FLUSH); - assert(res == Z_OK); - if (strm.avail_out == 0) { - buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); - bufferLengths.push_back(strm.avail_out = BUFSIZE); - } - } - - int res = Z_OK; - while (res == Z_OK) { - if (strm.avail_out == 0) { - buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); - bufferLengths.push_back(strm.avail_out = BUFSIZE); - } - res = deflate(&strm, Z_FINISH); - } - - assert(res == Z_STREAM_END); - bufferLengths.back() -= strm.avail_out; - // Could just do `nbuffers * BUFSIZE - strm.avail_out`. 
- const std::size_t bufferLen = - std::accumulate(bufferLengths.begin(), bufferLengths.end(), 0); - unsigned char *const buffer = new unsigned char[bufferLen]; - { - const std::size_t nbuffers = buffers.size(); - unsigned char *p = buffer; - for (std::size_t i = 0; i < nbuffers; ++i) { - unsigned char const *const buffer = buffers.at(i); - const std::size_t bufferLength = bufferLengths.at(i); - std::copy(buffer, buffer + bufferLength, p); - p += bufferLength; - delete[] buffer; - } - } - deflateEnd(&strm); - - return MemoryBuffer(buffer, bufferLen); -} - -void decompress_memory_z(void z_const *const src, const std::size_t srcLen, - unsigned char *const dst, const std::size_t dstLen) { - z_stream strm = {}; - strm.total_in = strm.avail_in = srcLen; - strm.total_out = strm.avail_out = dstLen; - strm.next_in = static_cast(src); - strm.next_out = reinterpret_cast(dst); - - strm.zalloc = Z_NULL; - strm.zfree = Z_NULL; - strm.opaque = Z_NULL; - - [[maybe_unused]] int res; - res = inflateInit2(&strm, (15 + 32)); // 15 window bits, and the +32 tells - // zlib to to detect if using gzip or - // zlib - assert(res == Z_OK); - res = inflate(&strm, Z_FINISH); - assert(res == Z_STREAM_END); - res = inflateEnd(&strm); - assert(res == Z_OK); -} - -#ifdef MGARD_ZSTD -void decompress_memory_zstd(void const *const src, const std::size_t srcLen, - unsigned char *const dst, - const std::size_t dstLen) { - size_t const dSize = ZSTD_decompress(dst, dstLen, src, srcLen); - CHECK_ZSTD(dSize); - - /* When zstd knows the content size, it will error if it doesn't match. 
*/ - CHECK(dstLen == dSize, "Impossible because zstd will check this condition!"); -} -#endif - namespace { template diff --git a/src/lossless_zlib.cpp b/src/lossless_zlib.cpp new file mode 100644 index 0000000000..272cb6e31c --- /dev/null +++ b/src/lossless_zlib.cpp @@ -0,0 +1,88 @@ +#include "lossless.hpp" + +#include +#include +#include + +namespace mgard { + +MemoryBuffer compress_memory_z(void z_const *const src, + const std::size_t srcLen) { + const std::size_t BUFSIZE = 2048 * 1024; + std::vector buffers; + std::vector bufferLengths; + + z_stream strm; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.next_in = static_cast(src); + strm.avail_in = srcLen; + buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); + bufferLengths.push_back(strm.avail_out = BUFSIZE); + + deflateInit(&strm, Z_BEST_COMPRESSION); + + while (strm.avail_in != 0) { + [[maybe_unused]] const int res = deflate(&strm, Z_NO_FLUSH); + assert(res == Z_OK); + if (strm.avail_out == 0) { + buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); + bufferLengths.push_back(strm.avail_out = BUFSIZE); + } + } + + int res = Z_OK; + while (res == Z_OK) { + if (strm.avail_out == 0) { + buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); + bufferLengths.push_back(strm.avail_out = BUFSIZE); + } + res = deflate(&strm, Z_FINISH); + } + + assert(res == Z_STREAM_END); + bufferLengths.back() -= strm.avail_out; + // Could just do `nbuffers * BUFSIZE - strm.avail_out`. 
+ const std::size_t bufferLen = + std::accumulate(bufferLengths.begin(), bufferLengths.end(), 0); + unsigned char *const buffer = new unsigned char[bufferLen]; + { + const std::size_t nbuffers = buffers.size(); + unsigned char *p = buffer; + for (std::size_t i = 0; i < nbuffers; ++i) { + unsigned char const *const buffer = buffers.at(i); + const std::size_t bufferLength = bufferLengths.at(i); + std::copy(buffer, buffer + bufferLength, p); + p += bufferLength; + delete[] buffer; + } + } + deflateEnd(&strm); + + return MemoryBuffer(buffer, bufferLen); +} + +void decompress_memory_z(void z_const *const src, const std::size_t srcLen, + unsigned char *const dst, const std::size_t dstLen) { + z_stream strm = {}; + strm.total_in = strm.avail_in = srcLen; + strm.total_out = strm.avail_out = dstLen; + strm.next_in = static_cast(src); + strm.next_out = reinterpret_cast(dst); + + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + + [[maybe_unused]] int res; + res = inflateInit2(&strm, (15 + 32)); // 15 window bits, and the +32 tells + // zlib to to detect if using gzip or + // zlib + assert(res == Z_OK); + res = inflate(&strm, Z_FINISH); + assert(res == Z_STREAM_END); + res = inflateEnd(&strm); + assert(res == Z_OK); +} + +} // namespace mgard diff --git a/src/lossless_zstd.cpp b/src/lossless_zstd.cpp new file mode 100644 index 0000000000..4b7fc4bf28 --- /dev/null +++ b/src/lossless_zstd.cpp @@ -0,0 +1,60 @@ +#include "lossless.hpp" + +#include +#include + +#ifndef MGARD_ZSTD +#error "This file requires ZSTD." +#endif + +#include + +namespace mgard { + +/*! CHECK + * Check that the condition holds. If it doesn't print a message and die. + */ +#define CHECK(cond, ...) \ + do { \ + if (!(cond)) { \ + std::fprintf(stderr, "%s:%d CHECK(%s) failed: ", __FILE__, __LINE__, \ + #cond); \ + std::fprintf(stderr, "" __VA_ARGS__); \ + std::fprintf(stderr, "\n"); \ + std::exit(1); \ + } \ + } while (0) + +/*! 
CHECK_ZSTD + * Check the zstd error code and die if an error occurred after printing a + * message. + */ +#define CHECK_ZSTD(fn, ...) \ + do { \ + std::size_t const err = (fn); \ + CHECK(!ZSTD_isError(err), "%s", ZSTD_getErrorName(err)); \ + } while (0) + +MemoryBuffer compress_memory_zstd(void const *const src, + const std::size_t srcLen) { + const std::size_t cBuffSize = ZSTD_compressBound(srcLen); + unsigned char *const buffer = new unsigned char[cBuffSize]; + const std::size_t cSize = ZSTD_compress(buffer, cBuffSize, src, srcLen, 1); + CHECK_ZSTD(cSize); + return MemoryBuffer(buffer, cSize); +} + +void decompress_memory_zstd(void const *const src, const std::size_t srcLen, + unsigned char *const dst, + const std::size_t dstLen) { + std::size_t const dSize = ZSTD_decompress(dst, dstLen, src, srcLen); + CHECK_ZSTD(dSize); + + /* When zstd knows the content size, it will error if it doesn't match. */ + CHECK(dstLen == dSize, "Impossible because zstd will check this condition!"); +} + +#undef CHECK_ZSTD +#undef CHECK + +} // namespace mgard From b697831bb3a3306f03dc6723ae7242ae82b7a467 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Mon, 20 Jun 2022 11:46:19 -0400 Subject: [PATCH 48/58] Contain `z_const` casts to `lossless_zlib.cpp`. --- include/lossless.hpp | 14 +++---- src/huffman.cpp | 6 +-- src/lossless_dispatcher.cpp | 62 +++++++++++++++---------------- src/lossless_zlib.cpp | 10 +++-- tests/src/lossless_regression.cpp | 5 +-- 5 files changed, 46 insertions(+), 51 deletions(-) diff --git a/include/lossless.hpp b/include/lossless.hpp index d24cee9a0b..3f5a4f3fb8 100644 --- a/include/lossless.hpp +++ b/include/lossless.hpp @@ -5,11 +5,6 @@ #include -// For `z_const`. -#include - -#include - #include "proto/mgard.pb.h" #include "utilities.hpp" @@ -38,7 +33,7 @@ void decompress_memory_zstd(void const *const src, const std::size_t srcLen, //! //!\param src Array to be compressed. //!\param srcLen Size in bytes of the array to be compressed. 
-MemoryBuffer compress_memory_z(void z_const *const src, +MemoryBuffer compress_memory_z(void const *const src, const std::size_t srcLen); //! Decompress an array with `compress_memory_z`. @@ -47,7 +42,7 @@ MemoryBuffer compress_memory_z(void z_const *const src, //!\param srcLen Size in bytes of the compressed array data //!\param dst Decompressed array. //!\param dstLen Size in bytes of the decompressed array. -void decompress_memory_z(void z_const *const src, const std::size_t srcLen, +void decompress_memory_z(void const *const src, const std::size_t srcLen, unsigned char *const dst, const std::size_t dstLen); //! Compress an array of quantized multilevel coefficients. @@ -57,7 +52,8 @@ void decompress_memory_z(void z_const *const src, const std::size_t srcLen, //!\param[in] header Header for the self-describing buffer. //!\param[in] src Array of quantized multilevel coefficients. //!\param[in] srcLen Size in bytes of the input array. -MemoryBuffer compress(const pb::Header &header, void *const src, +MemoryBuffer compress(const pb::Header &header, + void const *const src, const std::size_t srcLen); //! Decompress an array of quantized multilevel coefficients. @@ -69,7 +65,7 @@ MemoryBuffer compress(const pb::Header &header, void *const src, //!\param[in] srcLen Size in bytes of the compressed array. //!\param[out] dst Decompressed array. //!\param[in] dstLen Size in bytes of the decompressed array. 
-void decompress(const pb::Header &header, void *const src, +void decompress(const pb::Header &header, void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen); diff --git a/src/huffman.cpp b/src/huffman.cpp index 4ffd4a05af..620da655bc 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -66,8 +66,7 @@ compress_serialized_huffman(const pb::Header &header, const MemoryBuffer &payload) { switch (header.encoding().compressor()) { case pb::Encoding::CPU_HUFFMAN_ZLIB: - return compress_memory_z( - const_cast(payload.data.get()), payload.size); + return compress_memory_z(payload.data.get(), payload.size); case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD return compress_memory_zstd(payload.data.get(), payload.size); @@ -147,8 +146,7 @@ HuffmanEncodedStream decompress_deserialize(const pb::Header &header, switch (header.encoding().compressor()) { case pb::Encoding::CPU_HUFFMAN_ZLIB: - decompress_memory_z(const_cast(src_), srcLen_, - dst_, dstLen_); + decompress_memory_z(src_, srcLen_, dst_, dstLen_); break; case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD diff --git a/src/lossless_dispatcher.cpp b/src/lossless_dispatcher.cpp index 97e565dc25..33dafb58b2 100644 --- a/src/lossless_dispatcher.cpp +++ b/src/lossless_dispatcher.cpp @@ -32,7 +32,7 @@ MemoryBuffer compress_huffman_C_rfmh_(const pb::Header &header, // `C` being either ZSTD or `zlib`. 
MemoryBuffer compress_huffman_C_rfmh(const pb::Header &header, - void *const src, + void const *const src, const std::size_t srcLen) { assert(header.encoding().serialization() == pb::Encoding::RFMH); @@ -51,7 +51,7 @@ MemoryBuffer compress_huffman_C_rfmh(const pb::Header &header, } MemoryBuffer -compress_huffman_C_deprecated(const pb::Header &header, void *const src, +compress_huffman_C_deprecated(const pb::Header &header, void const *const src, const std::size_t srcLen) { check_quantization_buffer(header, src, srcLen); @@ -75,18 +75,16 @@ compress_huffman_C_deprecated(const pb::Header &header, void *const src, srcLen / sizeof(long int))); } -MemoryBuffer -compress_huffman_zlib_deprecated(const pb::Header &header, void *const src, - const std::size_t srcLen) { +MemoryBuffer compress_huffman_zlib_deprecated( + const pb::Header &header, void const *const src, const std::size_t srcLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); return compress_huffman_C_deprecated(header, src, srcLen); } #ifdef MGARD_ZSTD -MemoryBuffer -compress_huffman_zstd_deprecated(const pb::Header &header, void *const src, - const std::size_t srcLen) { +MemoryBuffer compress_huffman_zstd_deprecated( + const pb::Header &header, void const *const src, const std::size_t srcLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); return compress_huffman_C_deprecated(header, src, srcLen); @@ -118,7 +116,7 @@ MemoryBuffer concatenate_nhuffman_and_compressed( } // namespace MemoryBuffer -compress_huffman_zlib_rfmh(const pb::Header &header, void *const src, +compress_huffman_zlib_rfmh(const pb::Header &header, void const *const src, const std::size_t srcLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); assert(header.encoding().serialization() == pb::Encoding::RFMH); @@ -132,7 +130,7 @@ compress_huffman_zlib_rfmh(const pb::Header &header, void *const src, #ifdef MGARD_ZSTD MemoryBuffer -compress_huffman_zstd_rfmh(const 
pb::Header &header, void *const src, +compress_huffman_zstd_rfmh(const pb::Header &header, void const *const src, const std::size_t srcLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); assert(header.encoding().serialization() == pb::Encoding::RFMH); @@ -145,7 +143,7 @@ compress_huffman_zstd_rfmh(const pb::Header &header, void *const src, #endif MemoryBuffer compress_huffman_zlib(const pb::Header &header, - void *const src, + void const *const src, const std::size_t srcLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); @@ -161,7 +159,7 @@ MemoryBuffer compress_huffman_zlib(const pb::Header &header, #ifdef MGARD_ZSTD MemoryBuffer compress_huffman_zstd(const pb::Header &header, - void *const src, + void const *const src, const std::size_t srcLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); @@ -178,7 +176,8 @@ MemoryBuffer compress_huffman_zstd(const pb::Header &header, } // namespace -MemoryBuffer compress(const pb::Header &header, void *const src, +MemoryBuffer compress(const pb::Header &header, + void const *const src, const std::size_t srcLen) { switch (header.encoding().compressor()) { case pb::Encoding::CPU_ZLIB: @@ -202,8 +201,8 @@ MemoryBuffer compress(const pb::Header &header, void *const src, } } -void decompress_noop(void *const src, const std::size_t srcLen, void *const dst, - const std::size_t dstLen) { +void decompress_noop(void const *const src, const std::size_t srcLen, + void *const dst, const std::size_t dstLen) { if (srcLen != dstLen) { throw std::invalid_argument("source and destination lengths must be equal"); } @@ -257,7 +256,8 @@ void decompress_huffman_C_rfmh(const pb::Header &header, } } -void decompress_huffman_C_deprecated(const pb::Header &header, void *const src, +void decompress_huffman_C_deprecated(const pb::Header &header, + void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen) { 
check_quantization_buffer(header, dst, dstLen); @@ -291,7 +291,7 @@ void decompress_huffman_C_deprecated(const pb::Header &header, void *const src, } void decompress_huffman_zlib_deprecated(const pb::Header &header, - void *const src, + void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen) { @@ -302,7 +302,7 @@ void decompress_huffman_zlib_deprecated(const pb::Header &header, #ifdef MGARD_ZSTD void decompress_huffman_zstd_deprecated(const pb::Header &header, - void *const src, + void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen) { @@ -312,7 +312,8 @@ void decompress_huffman_zstd_deprecated(const pb::Header &header, } #endif -void decompress_huffman_zlib_rfmh(const pb::Header &header, void *const src, +void decompress_huffman_zlib_rfmh(const pb::Header &header, + void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); @@ -321,15 +322,15 @@ void decompress_huffman_zlib_rfmh(const pb::Header &header, void *const src, BufferWindow window(src, srcLen); // Read theSsze in bytes of the serialized Huffman tree. 
MemoryBuffer encoded(read_header_size(window)); - decompress_memory_z(const_cast(window.current), - window.end - window.current, encoded.data.get(), - encoded.size); + decompress_memory_z(window.current, window.end - window.current, + encoded.data.get(), encoded.size); return decompress_huffman_C_rfmh(header, encoded, dst, dstLen); } #ifdef MGARD_ZSTD -void decompress_huffman_zstd_rfmh(const pb::Header &header, void *const src, +void decompress_huffman_zstd_rfmh(const pb::Header &header, + void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); @@ -338,15 +339,14 @@ void decompress_huffman_zstd_rfmh(const pb::Header &header, void *const src, BufferWindow window(src, srcLen); // Read the size in bytes of the serialized Huffman tree. MemoryBuffer encoded(read_header_size(window)); - decompress_memory_zstd(const_cast(window.current), - window.end - window.current, encoded.data.get(), - encoded.size); + decompress_memory_zstd(window.current, window.end - window.current, + encoded.data.get(), encoded.size); return decompress_huffman_C_rfmh(header, encoded, dst, dstLen); } #endif -void decompress_huffman_zlib(const pb::Header &header, void *const src, +void decompress_huffman_zlib(const pb::Header &header, void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); @@ -362,7 +362,7 @@ void decompress_huffman_zlib(const pb::Header &header, void *const src, } #ifdef MGARD_ZSTD -void decompress_huffman_zstd(const pb::Header &header, void *const src, +void decompress_huffman_zstd(const pb::Header &header, void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen) { assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); @@ -380,13 +380,13 @@ void decompress_huffman_zstd(const pb::Header &header, void *const 
src, } // namespace -void decompress(const pb::Header &header, void *const src, +void decompress(const pb::Header &header, void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen) { switch (header.encoding().compressor()) { case pb::Encoding::CPU_ZLIB: - return decompress_memory_z(const_cast(src), srcLen, - static_cast(dst), dstLen); + return decompress_memory_z(src, srcLen, static_cast(dst), + dstLen); case pb::Encoding::CPU_ZSTD: #ifdef MGARD_ZSTD return decompress_memory_zstd( diff --git a/src/lossless_zlib.cpp b/src/lossless_zlib.cpp index 272cb6e31c..9bb30c0b77 100644 --- a/src/lossless_zlib.cpp +++ b/src/lossless_zlib.cpp @@ -4,9 +4,11 @@ #include #include +#include + namespace mgard { -MemoryBuffer compress_memory_z(void z_const *const src, +MemoryBuffer compress_memory_z(void const *const src, const std::size_t srcLen) { const std::size_t BUFSIZE = 2048 * 1024; std::vector buffers; @@ -15,7 +17,7 @@ MemoryBuffer compress_memory_z(void z_const *const src, z_stream strm; strm.zalloc = Z_NULL; strm.zfree = Z_NULL; - strm.next_in = static_cast(src); + strm.next_in = static_cast(const_cast(src)); strm.avail_in = srcLen; buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); bufferLengths.push_back(strm.avail_out = BUFSIZE); @@ -62,12 +64,12 @@ MemoryBuffer compress_memory_z(void z_const *const src, return MemoryBuffer(buffer, bufferLen); } -void decompress_memory_z(void z_const *const src, const std::size_t srcLen, +void decompress_memory_z(void const *const src, const std::size_t srcLen, unsigned char *const dst, const std::size_t dstLen) { z_stream strm = {}; strm.total_in = strm.avail_in = srcLen; strm.total_out = strm.avail_out = dstLen; - strm.next_in = static_cast(src); + strm.next_in = static_cast(const_cast(src)); strm.next_out = reinterpret_cast(dst); strm.zalloc = Z_NULL; diff --git a/tests/src/lossless_regression.cpp b/tests/src/lossless_regression.cpp index e84725cfc3..4320bee17a 100644 --- 
a/tests/src/lossless_regression.cpp +++ b/tests/src/lossless_regression.cpp @@ -32,7 +32,7 @@ MemoryBuffer compress_serialized(const pb::Header &header, switch (header.encoding().compressor()) { case pb::Encoding::CPU_HUFFMAN_ZLIB: - return compress_memory_z(const_cast(p), n); + return compress_memory_z(p, n); case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD return compress_memory_zstd(p, n); @@ -132,8 +132,7 @@ void decompress_memory_huffman(const pb::Header &header, switch (header.encoding().compressor()) { case pb::Encoding::CPU_HUFFMAN_ZLIB: - decompress_memory_z(const_cast(src_), srcLen_, - dst_, dstLen_); + decompress_memory_z(src_, srcLen_, dst_, dstLen_); break; case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD From 0848131b9825095498d6b11ab7016e84eb7a4ec9 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Mon, 20 Jun 2022 12:16:55 -0400 Subject: [PATCH 49/58] Rename lossless compression functions. --- include/lossless.hpp | 20 ++++++++++---------- src/cuda/LosslessCompression.cu | 4 ++-- src/huffman.cpp | 8 ++++---- src/lossless_dispatcher.cpp | 28 ++++++++++++++-------------- src/lossless_zlib.cpp | 8 ++++---- src/lossless_zstd.cpp | 9 ++++----- tests/src/lossless_regression.cpp | 8 ++++---- tests/src/test_lossless.cpp | 8 ++++---- 8 files changed, 46 insertions(+), 47 deletions(-) diff --git a/include/lossless.hpp b/include/lossless.hpp index 3f5a4f3fb8..b3cafe1175 100644 --- a/include/lossless.hpp +++ b/include/lossless.hpp @@ -16,34 +16,34 @@ namespace mgard { //! //!\param[in] src Array to be compressed. //!\param[in] srcLen Size in bytes of the array to be compressed. -MemoryBuffer compress_memory_zstd(void const *const src, - const std::size_t srcLen); +MemoryBuffer compress_zstd(void const *const src, + const std::size_t srcLen); -//! Decompress an array compressed with `compress_memory_zstd`. +//! Decompress an array compressed with `compress_zstd`. //! //!\param[in] src Compressed array. 
//!\param[in] srcLen Size in bytes of the compressed array. //!\param[out] dst Decompressed array. //!\param[in] dstLen Size in bytes of the decompressed array. -void decompress_memory_zstd(void const *const src, const std::size_t srcLen, - unsigned char *const dst, const std::size_t dstLen); +void decompress_zstd(void const *const src, const std::size_t srcLen, + unsigned char *const dst, const std::size_t dstLen); #endif //! Compress an array using `zlib`. //! //!\param src Array to be compressed. //!\param srcLen Size in bytes of the array to be compressed. -MemoryBuffer compress_memory_z(void const *const src, - const std::size_t srcLen); +MemoryBuffer compress_zlib(void const *const src, + const std::size_t srcLen); -//! Decompress an array with `compress_memory_z`. +//! Decompress an array compressed with `compress_zlib`. //! //!\param src Compressed array. //!\param srcLen Size in bytes of the compressed array data //!\param dst Decompressed array. //!\param dstLen Size in bytes of the decompressed array. -void decompress_memory_z(void const *const src, const std::size_t srcLen, - unsigned char *const dst, const std::size_t dstLen); +void decompress_zlib(void const *const src, const std::size_t srcLen, + unsigned char *const dst, const std::size_t dstLen); //! Compress an array of quantized multilevel coefficients. //! 
diff --git a/src/cuda/LosslessCompression.cu b/src/cuda/LosslessCompression.cu index 072d6be598..3d32d660f7 100644 --- a/src/cuda/LosslessCompression.cu +++ b/src/cuda/LosslessCompression.cu @@ -90,7 +90,7 @@ unsigned char *compress_memory_huffman(long int *const src, free(out_data_miss); // const MemoryBuffer out_data = - // compress_memory_zstd(payload, total_size); + // compress_zstd(payload, total_size); const size_t cBuffSize = ZSTD_compressBound(total_size); unsigned char *const zstd_buffer = new unsigned char[cBuffSize]; @@ -148,7 +148,7 @@ void decompress_memory_huffman(unsigned char *const src, out_tree_size + out_data_hit_size / 8 + 4 + out_data_miss_size; unsigned char *huffman_encoding_p = (unsigned char *)malloc(total_huffman_size); - // decompress_memory_zstd(buf, srcLen - 3 * sizeof(size_t), + // decompress_zstd(buf, srcLen - 3 * sizeof(size_t), // huffman_encoding_p, // total_huffman_size); diff --git a/src/huffman.cpp b/src/huffman.cpp index 620da655bc..a9f9fbca1e 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -66,10 +66,10 @@ compress_serialized_huffman(const pb::Header &header, const MemoryBuffer &payload) { switch (header.encoding().compressor()) { case pb::Encoding::CPU_HUFFMAN_ZLIB: - return compress_memory_z(payload.data.get(), payload.size); + return compress_zlib(payload.data.get(), payload.size); case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD - return compress_memory_zstd(payload.data.get(), payload.size); + return compress_zstd(payload.data.get(), payload.size); #else throw std::runtime_error("MGARD compiled without ZSTD support"); #endif @@ -146,11 +146,11 @@ HuffmanEncodedStream decompress_deserialize(const pb::Header &header, switch (header.encoding().compressor()) { case pb::Encoding::CPU_HUFFMAN_ZLIB: - decompress_memory_z(src_, srcLen_, dst_, dstLen_); + decompress_zlib(src_, srcLen_, dst_, dstLen_); break; case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD - decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); + 
decompress_zstd(src_, srcLen_, dst_, dstLen_); break; #else throw std::runtime_error("MGARD compiled without ZSTD support"); diff --git a/src/lossless_dispatcher.cpp b/src/lossless_dispatcher.cpp index 33dafb58b2..175e7495ba 100644 --- a/src/lossless_dispatcher.cpp +++ b/src/lossless_dispatcher.cpp @@ -93,8 +93,8 @@ MemoryBuffer compress_huffman_zstd_deprecated( namespace { -// `decompress_memory_z` and `decompress_memory_zstd` need to know the size of -// the decompressed buffer before they can decompress. So, in addition to the +// `decompress_zlib` and `decompress_zstd` need to know the size of the +// decompressed buffer before they can decompress. So, in addition to the // compressed serialized Huffman tree (`compressed`), we need to store the size // in bytes of the serialized Huffman tree (`nhuffman`). MemoryBuffer concatenate_nhuffman_and_compressed( @@ -124,7 +124,7 @@ compress_huffman_zlib_rfmh(const pb::Header &header, void const *const src, const MemoryBuffer encoded = compress_huffman_C_rfmh(header, src, srcLen); const MemoryBuffer compressed = - compress_memory_z(encoded.data.get(), encoded.size); + compress_zlib(encoded.data.get(), encoded.size); return concatenate_nhuffman_and_compressed(encoded.size, compressed); } @@ -138,7 +138,7 @@ compress_huffman_zstd_rfmh(const pb::Header &header, void const *const src, const MemoryBuffer encoded = compress_huffman_C_rfmh(header, src, srcLen); return concatenate_nhuffman_and_compressed( - encoded.size, compress_memory_zstd(encoded.data.get(), encoded.size)); + encoded.size, compress_zstd(encoded.data.get(), encoded.size)); } #endif @@ -181,10 +181,10 @@ MemoryBuffer compress(const pb::Header &header, const std::size_t srcLen) { switch (header.encoding().compressor()) { case pb::Encoding::CPU_ZLIB: - return compress_memory_z(src, srcLen); + return compress_zlib(src, srcLen); case pb::Encoding::CPU_ZSTD: #ifdef MGARD_ZSTD - return compress_memory_zstd(src, srcLen); + return compress_zstd(src, srcLen); #else 
throw std::runtime_error("MGARD compiled without ZSTD support"); #endif @@ -322,8 +322,8 @@ void decompress_huffman_zlib_rfmh(const pb::Header &header, BufferWindow window(src, srcLen); // Read theSsze in bytes of the serialized Huffman tree. MemoryBuffer encoded(read_header_size(window)); - decompress_memory_z(window.current, window.end - window.current, - encoded.data.get(), encoded.size); + decompress_zlib(window.current, window.end - window.current, + encoded.data.get(), encoded.size); return decompress_huffman_C_rfmh(header, encoded, dst, dstLen); } @@ -339,8 +339,8 @@ void decompress_huffman_zstd_rfmh(const pb::Header &header, BufferWindow window(src, srcLen); // Read the size in bytes of the serialized Huffman tree. MemoryBuffer encoded(read_header_size(window)); - decompress_memory_zstd(window.current, window.end - window.current, - encoded.data.get(), encoded.size); + decompress_zstd(window.current, window.end - window.current, + encoded.data.get(), encoded.size); return decompress_huffman_C_rfmh(header, encoded, dst, dstLen); } @@ -385,12 +385,12 @@ void decompress(const pb::Header &header, void const *const src, const std::size_t dstLen) { switch (header.encoding().compressor()) { case pb::Encoding::CPU_ZLIB: - return decompress_memory_z(src, srcLen, static_cast(dst), - dstLen); + return decompress_zlib(src, srcLen, static_cast(dst), + dstLen); case pb::Encoding::CPU_ZSTD: #ifdef MGARD_ZSTD - return decompress_memory_zstd( - src, srcLen, reinterpret_cast(dst), dstLen); + return decompress_zstd(src, srcLen, reinterpret_cast(dst), + dstLen); #else throw std::runtime_error("MGARD compiled without ZSTD support"); #endif diff --git a/src/lossless_zlib.cpp b/src/lossless_zlib.cpp index 9bb30c0b77..b41ac643ac 100644 --- a/src/lossless_zlib.cpp +++ b/src/lossless_zlib.cpp @@ -8,8 +8,8 @@ namespace mgard { -MemoryBuffer compress_memory_z(void const *const src, - const std::size_t srcLen) { +MemoryBuffer compress_zlib(void const *const src, + const std::size_t 
srcLen) { const std::size_t BUFSIZE = 2048 * 1024; std::vector buffers; std::vector bufferLengths; @@ -64,8 +64,8 @@ MemoryBuffer compress_memory_z(void const *const src, return MemoryBuffer(buffer, bufferLen); } -void decompress_memory_z(void const *const src, const std::size_t srcLen, - unsigned char *const dst, const std::size_t dstLen) { +void decompress_zlib(void const *const src, const std::size_t srcLen, + unsigned char *const dst, const std::size_t dstLen) { z_stream strm = {}; strm.total_in = strm.avail_in = srcLen; strm.total_out = strm.avail_out = dstLen; diff --git a/src/lossless_zstd.cpp b/src/lossless_zstd.cpp index 4b7fc4bf28..749c5794a4 100644 --- a/src/lossless_zstd.cpp +++ b/src/lossless_zstd.cpp @@ -35,8 +35,8 @@ namespace mgard { CHECK(!ZSTD_isError(err), "%s", ZSTD_getErrorName(err)); \ } while (0) -MemoryBuffer compress_memory_zstd(void const *const src, - const std::size_t srcLen) { +MemoryBuffer compress_zstd(void const *const src, + const std::size_t srcLen) { const std::size_t cBuffSize = ZSTD_compressBound(srcLen); unsigned char *const buffer = new unsigned char[cBuffSize]; const std::size_t cSize = ZSTD_compress(buffer, cBuffSize, src, srcLen, 1); @@ -44,9 +44,8 @@ MemoryBuffer compress_memory_zstd(void const *const src, return MemoryBuffer(buffer, cSize); } -void decompress_memory_zstd(void const *const src, const std::size_t srcLen, - unsigned char *const dst, - const std::size_t dstLen) { +void decompress_zstd(void const *const src, const std::size_t srcLen, + unsigned char *const dst, const std::size_t dstLen) { std::size_t const dSize = ZSTD_decompress(dst, dstLen, src, srcLen); CHECK_ZSTD(dSize); diff --git a/tests/src/lossless_regression.cpp b/tests/src/lossless_regression.cpp index 4320bee17a..11c04a4a01 100644 --- a/tests/src/lossless_regression.cpp +++ b/tests/src/lossless_regression.cpp @@ -32,10 +32,10 @@ MemoryBuffer compress_serialized(const pb::Header &header, switch (header.encoding().compressor()) { case 
pb::Encoding::CPU_HUFFMAN_ZLIB: - return compress_memory_z(p, n); + return compress_zlib(p, n); case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD - return compress_memory_zstd(p, n); + return compress_zstd(p, n); #else throw std::runtime_error("MGARD compiled without ZSTD support"); #endif @@ -132,11 +132,11 @@ void decompress_memory_huffman(const pb::Header &header, switch (header.encoding().compressor()) { case pb::Encoding::CPU_HUFFMAN_ZLIB: - decompress_memory_z(src_, srcLen_, dst_, dstLen_); + decompress_zlib(src_, srcLen_, dst_, dstLen_); break; case pb::Encoding::CPU_HUFFMAN_ZSTD: #ifdef MGARD_ZSTD - decompress_memory_zstd(src_, srcLen_, dst_, dstLen_); + decompress_zstd(src_, srcLen_, dst_, dstLen_); break; #else throw std::runtime_error("MGARD compiled without ZSTD support"); diff --git a/tests/src/test_lossless.cpp b/tests/src/test_lossless.cpp index fa9198709f..bcb5b32bb6 100644 --- a/tests/src/test_lossless.cpp +++ b/tests/src/test_lossless.cpp @@ -189,10 +189,10 @@ void test_zstd_identity(std::uniform_int_distribution &dis, std::generate(src, src + n, f); unsigned char *const src_ = new unsigned char[n]; std::copy(src, src + n, src_); - mgard::MemoryBuffer dst = mgard::compress_memory_zstd(src_, n); + mgard::MemoryBuffer dst = mgard::compress_zstd(src_, n); delete[] src_; unsigned char *const decompressed = new unsigned char[n]; - mgard::decompress_memory_zstd(dst.data.get(), dst.size, decompressed, n); + mgard::decompress_zstd(dst.data.get(), dst.size, decompressed, n); REQUIRE(std::equal(src, src + n, decompressed)); delete[] decompressed; delete[] src; @@ -219,10 +219,10 @@ void test_zlib_identity(std::uniform_int_distribution &dis, std::generate(src, src + n, f); unsigned char *const src_ = new unsigned char[n]; std::copy(src, src + n, src_); - mgard::MemoryBuffer dst = mgard::compress_memory_z(src_, n); + mgard::MemoryBuffer dst = mgard::compress_zlib(src_, n); delete[] src_; unsigned char *const decompressed = new unsigned char[n]; - 
mgard::decompress_memory_z(dst.data.get(), dst.size, decompressed, n); + mgard::decompress_zlib(dst.data.get(), dst.size, decompressed, n); REQUIRE(std::equal(src, src + n, decompressed)); delete[] decompressed; delete[] src; From 48529f9d2155618955780281595c65be2f503243 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Mon, 20 Jun 2022 12:49:48 -0400 Subject: [PATCH 50/58] Change argument order in periodic data tests. --- tests/src/test_huffman.cpp | 32 +++++++++++++++++--------------- tests/src/test_lossless.cpp | 30 +++++++++++++++--------------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp index a444d0c6c1..8874ddd168 100644 --- a/tests/src/test_huffman.cpp +++ b/tests/src/test_huffman.cpp @@ -74,8 +74,9 @@ void test_encoding_regression_constant(const std::size_t N, const long int q) { delete[] quantized; } -void test_encoding_regression_periodic(const std::size_t N, const long int q, - const std::size_t period) { +void test_encoding_regression_periodic(const std::size_t N, + const std::size_t period, + const long int q) { long int *const quantized = new long int[N]; std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); test_encoding_regression(quantized, N); @@ -99,8 +100,9 @@ void test_decoding_regression_constant(const std::size_t N, const long int q) { delete[] quantized; } -void test_decoding_regression_periodic(const std::size_t N, const long int q, - const std::size_t period) { +void test_decoding_regression_periodic(const std::size_t N, + const std::size_t period, + const long int q) { long int *const quantized = new long int[N]; std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); test_decoding_regression(quantized, N); @@ -126,8 +128,8 @@ void test_inversion_constant(const std::size_t N, const T q) { } template -void test_inversion_periodic(const std::size_t N, const T q, - const std::size_t period) { +void test_inversion_periodic(const 
std::size_t N, const std::size_t period, + const T q) { T *const quantized = new T[N]; std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); test_inversion(quantized, N); @@ -154,9 +156,9 @@ TEST_CASE("encoding regression", "[huffman] [regression]") { } SECTION("periodic data") { - test_encoding_regression_periodic(10, -3, 3); - test_encoding_regression_periodic(100, 0, 10); - test_encoding_regression_periodic(1000, 51, 17); + test_encoding_regression_periodic(10, 3, -3); + test_encoding_regression_periodic(100, 10, 0); + test_encoding_regression_periodic(1000, 17, 51); } SECTION("random data") { @@ -177,9 +179,9 @@ TEST_CASE("decoding regression", "[huffman] [regression]") { } SECTION("periodic data") { - test_decoding_regression_periodic(10, 12, 4); - test_decoding_regression_periodic(100, -71, 9); - test_decoding_regression_periodic(1000, 3280, 23); + test_decoding_regression_periodic(10, 4, 12); + test_decoding_regression_periodic(100, 9, -71); + test_decoding_regression_periodic(1000, 23, 3280); } SECTION("random data") { @@ -203,9 +205,9 @@ TEMPLATE_TEST_CASE("Huffman inversion", "[huffman]", std::int8_t, std::int16_t, } SECTION("periodic data") { - test_inversion_periodic(10, -dis(gen_), 11); - test_inversion_periodic(100, dis(gen_), 10); - test_inversion_periodic(1000, -dis(gen_), 9); + test_inversion_periodic(10, 3, -dis(gen_)); + test_inversion_periodic(100, 10, dis(gen_)); + test_inversion_periodic(1000, 9, -dis(gen_)); } SECTION("random data") { diff --git a/tests/src/test_lossless.cpp b/tests/src/test_lossless.cpp index bcb5b32bb6..b4e9c06661 100644 --- a/tests/src/test_lossless.cpp +++ b/tests/src/test_lossless.cpp @@ -90,8 +90,8 @@ void test_hcr_constant(const std::size_t srcLen, const long int q) { delete[] src; } -void test_hcr_periodic(const std::size_t srcLen, const long int initial, - const std::size_t period) { +void test_hcr_periodic(const std::size_t srcLen, const std::size_t period, + const long int initial) { long int 
*const src = new long int[srcLen]; std::generate(src, src + srcLen, PeriodicGenerator(period, initial)); test_huffman_compression_regression(src, srcLen); @@ -114,8 +114,8 @@ void test_hdr_constant(const std::size_t srcLen, const long int q) { delete[] src; } -void test_hdr_periodic(const std::size_t srcLen, const long int initial, - const std::size_t period) { +void test_hdr_periodic(const std::size_t srcLen, const std::size_t period, + const long int initial) { long int *const src = new long int[srcLen]; std::generate(src, src + srcLen, PeriodicGenerator(period, initial)); test_huffman_decompression_regression(src, srcLen); @@ -141,9 +141,9 @@ TEST_CASE("Huffman compression regression", "[compressors] [regression]") { } SECTION("periodic data") { - test_hcr_periodic(5, 0, 5); - test_hcr_periodic(25, -4, 6); - test_hcr_periodic(625, 22, 20); + test_hcr_periodic(5, 5, 0); + test_hcr_periodic(25, 6, -4); + test_hcr_periodic(625, 20, 22); } SECTION("random data") { @@ -164,9 +164,9 @@ TEST_CASE("Huffman decompression regression", "[compressors] [regression]") { } SECTION("periodic data") { - test_hdr_periodic(10, 0, 3); - test_hdr_periodic(100, -570, 10); - test_hdr_periodic(1000, 394, 19); + test_hdr_periodic(10, 3, 0); + test_hdr_periodic(100, 10, -570); + test_hdr_periodic(1000, 19, 394); } SECTION("random data") { @@ -270,8 +270,8 @@ void test_cd_inversion_constant(const mgard::pb::Header &header, template void test_cd_inversion_periodic(const mgard::pb::Header &header, - const std::size_t N, const Int q, - const std::size_t period) { + const std::size_t N, const std::size_t period, + const Int q) { Int *const quantized = new Int[N]; std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); test_cd_inversion(header, quantized, N); @@ -321,9 +321,9 @@ void test_cd_inversion_constant(const mgard::pb::Header &header) { template void test_cd_inversion_periodic(const mgard::pb::Header &header) { - test_cd_inversion_periodic(header, 100, -5, 3); - 
test_cd_inversion_periodic(header, 1000, 86, 60); - test_cd_inversion_periodic(header, 10000, 7, 62); + test_cd_inversion_periodic(header, 100, 3, -5); + test_cd_inversion_periodic(header, 1000, 60, 86); + test_cd_inversion_periodic(header, 10000, 62, 7); } template From 547d4e03e76ce44e38367023b0bcdc8846282ead Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Mon, 20 Jun 2022 12:59:23 -0400 Subject: [PATCH 51/58] Remove unused `NOOP_COMPRESSOR` decompressor. --- src/lossless_dispatcher.cpp | 12 ------------ src/mgard.proto | 2 ++ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/src/lossless_dispatcher.cpp b/src/lossless_dispatcher.cpp index 175e7495ba..eb6a71593a 100644 --- a/src/lossless_dispatcher.cpp +++ b/src/lossless_dispatcher.cpp @@ -201,18 +201,6 @@ MemoryBuffer compress(const pb::Header &header, } } -void decompress_noop(void const *const src, const std::size_t srcLen, - void *const dst, const std::size_t dstLen) { - if (srcLen != dstLen) { - throw std::invalid_argument("source and destination lengths must be equal"); - } - { - unsigned char const *const p = static_cast(src); - unsigned char *const q = static_cast(dst); - std::copy(p, p + srcLen, q); - } -} - namespace { template diff --git a/src/mgard.proto b/src/mgard.proto index 2407a78f7e..2b80d7de0a 100644 --- a/src/mgard.proto +++ b/src/mgard.proto @@ -124,6 +124,8 @@ message Encoding { SHUFFLE = 1; } enum Compressor { + // Not yet implemented. We'll want to add a message for quantized coefficients stored 'verbatim' + // (probably still somewhat compressed because of the varint encoding used for `int64`s). NOOP_COMPRESSOR = 0; // Explanation for the wonky numbering: this first case was originally called `CPU_HUFFMAN_ZLIB`, // but the relevant code didn't actually call the Huffman encoder. From 021e33000e42c2111eb96033b00dbad876205bcd Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 21 Jun 2022 12:05:40 -0400 Subject: [PATCH 52/58] Add `Chain` to allow iterator range concatenation. 
--- include/utilities.hpp | 87 ++++++++++++++++++++++++++++++++++++ include/utilities.tpp | 76 +++++++++++++++++++++++++++++++ tests/src/test_utilities.cpp | 55 +++++++++++++++++++++++ 3 files changed, 218 insertions(+) diff --git a/include/utilities.hpp b/include/utilities.hpp index 9f514ef472..43b005a0d4 100644 --- a/include/utilities.hpp +++ b/include/utilities.hpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace mgard { @@ -545,6 +546,92 @@ class Bits::iterator { unsigned char offset; }; +//! Concatenated iterator ranges. +//! +//! Approximate Python's `itertools.chain` generator. +template class Chain { +public: + //! Constructor. + //! + //!\param segments Beginnings and lengths of iterator ranges. + Chain(const std::vector> &segments); + + // Forward declaration. + class iterator; + + //! Return an iterator to the beginning of the enumeration. + iterator begin() const; + + //! Return an iterator to the end of the enumeration. + iterator end() const; + + //! Beginnings and lengths of iterator ranges. + std::vector> segments; +}; + +//! Equality comparison. +template bool operator==(const Chain &a, const Chain &b); + +//! Inequality comparison. +template bool operator!=(const Chain &a, const Chain &b); + +//! Iterator over concatenated iterator ranges. +template class Chain::iterator { +public: + //! Category of the iterator. + using iterator_category = std::forward_iterator_tag; + //! Type iterated over. + using value_type = typename std::iterator_traits::value_type; + //! Type for distance between iterators. + using difference_type = typename std::iterator_traits::difference_type; + //! Pointer to `value_type`. + using pointer = typename std::iterator_traits::pointer; + //! Type returned by the dereference operator. + using reference = typename std::iterator_traits::reference; + + //! Constructor. + //! + //!\param iterable Associated chain. + //!\param q Iterator to current segment. 
+ iterator( + const Chain &iterable, + const typename std::vector>::const_iterator q); + + //! Equality comparison. + bool operator==(const iterator &other) const; + + //! Inequality comparison. + bool operator!=(const iterator &other) const; + + //! Preincrement. + iterator &operator++(); + + //! Postincrement. + iterator operator++(int); + + //! Dereference. + reference operator*() const; + +private: + //! Associated chain. + const Chain &iterable; + + //! Iterator to current segment. + typename std::vector>::const_iterator q; + + //! Position in the current segment. + It p; + + //! Distance from the beginning of the current segment. + std::size_t i; + + //! Length of the current segment. + std::size_t n; + + //! Zero `i`; populate `p` and `n` from `q` if not at end. + void conditionally_start_segment(); +}; + } // namespace mgard #include "utilities.tpp" diff --git a/include/utilities.tpp b/include/utilities.tpp index 52fd9b4ba7..6260fa5af4 100644 --- a/include/utilities.tpp +++ b/include/utilities.tpp @@ -345,4 +345,80 @@ template MemoryBuffer::MemoryBuffer(const std::size_t size) : MemoryBuffer(new T[size], size) {} +template +Chain::Chain(const std::vector> &segments) + : segments(segments) {} + +template bool operator==(const Chain &a, const Chain &b) { + return a.segments == b.segments; +} + +template bool operator!=(const Chain &a, const Chain &b) { + return !operator==(a, b); +} + +template typename Chain::iterator Chain::begin() const { + return {*this, segments.begin()}; +} + +template typename Chain::iterator Chain::end() const { + return {*this, segments.end()}; +} + +template +Chain::iterator::iterator( + const Chain &iterable, + const typename std::vector>::const_iterator q) + : iterable(iterable), q(q) { + conditionally_start_segment(); +} + +template void Chain::iterator::conditionally_start_segment() { + i = 0; + if (q != iterable.segments.end()) { + const std::pair pair = *q; + p = pair.first; + n = pair.second; + if (not n) { + ++q; + 
conditionally_start_segment(); + } + } +} + +template +bool Chain::iterator:: +operator==(const typename Chain::iterator &other) const { + return i == other.i and q == other.q and iterable == other.iterable; +} + +template +bool Chain::iterator:: +operator!=(const typename Chain::iterator &other) const { + return !operator==(other); +} + +template +typename Chain::iterator &Chain::iterator::operator++() { + ++p; + ++i; + if (i == n) { + ++q; + conditionally_start_segment(); + } + return *this; +} + +template +typename Chain::iterator Chain::iterator::operator++(int) { + const iterator tmp = *this; + operator++(); + return tmp; +} + +template +typename Chain::iterator::reference Chain::iterator::operator*() const { + return *p; +} + } // namespace mgard diff --git a/tests/src/test_utilities.cpp b/tests/src/test_utilities.cpp index 1e53eec72e..e5d66656e8 100644 --- a/tests/src/test_utilities.cpp +++ b/tests/src/test_utilities.cpp @@ -253,3 +253,58 @@ TEST_CASE("Bits iteration", "[utilities]") { } } } + +TEST_CASE("Chain iteration", "[utilities]") { + SECTION("reading") { + const std::size_t N = 5; + std::array, N> in; + in.at(0) = {0}; + in.at(1) = {1, 2, 3}; + in.at(2) = {}; + in.at(3) = {4, 5, 6}; + in.at(4) = {7, 8, 9, 10}; + using It = std::vector::const_iterator; + std::vector> segments; + for (const std::vector &in_ : in) { + segments.push_back({in_.begin(), in_.size()}); + } + unsigned char expected = 0; + TrialTracker tracker; + for (const unsigned char read : mgard::Chain(segments)) { + tracker += read == expected++; + } + REQUIRE(tracker); + REQUIRE(expected == 11); + } + + SECTION("writing") { + const std::size_t N = 4; + std::array, N> out; + const std::array ns{3, 5, 0, 1}; + using It = std::vector::iterator; + std::vector> segments; + segments.reserve(N); + for (std::size_t i = 0; i < N; ++i) { + std::vector &out_ = out.at(i); + const std::size_t n = ns.at(i); + out_.resize(n); + segments.push_back({out_.begin(), n}); + } + + unsigned short int a = 1; + 
unsigned short int b = 1; + for (unsigned short int &c : mgard::Chain(segments)) { + c = a; + const unsigned short int tmp = a + b; + a = b; + b = tmp; + } + + std::array, N> expected; + expected.at(0) = {1, 1, 2}; + expected.at(1) = {3, 5, 8, 13, 21}; + expected.at(2) = {}; + expected.at(3) = {34}; + REQUIRE(out == expected); + } +} From 1d07149c0d79659cd485b7d75e84024d4966022b Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 28 Jun 2022 16:11:32 -0400 Subject: [PATCH 53/58] Limit sizes of frequency and 'missed' subtables. --- include/huffman.tpp | 285 ++++++++++++++++++++++++++++++++++++++++---- src/mgard.proto | 16 ++- 2 files changed, 273 insertions(+), 28 deletions(-) diff --git a/include/huffman.tpp b/include/huffman.tpp index 904c23e24f..cf5706e9b1 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -22,6 +22,7 @@ using Endpoints = google::protobuf::RepeatedField; using Missed = google::protobuf::RepeatedField; using Frequencies = google::protobuf::Map; +using SubtableSizes = google::protobuf::RepeatedField; } // namespace @@ -176,6 +177,221 @@ void HuffmanCode::recursively_set_codewords( } } +namespace { + +//! Maximum number of elements per frequency/missed subtable. +inline constexpr std::size_t SUBTABLE_MAX_SIZE = 1 << 20; + +//! A logical table split into one or more subtables of moderate size. +//! +//! The logical table can be read by chaining the subtables. +template struct Supertable { + // The beginning and size of a subtable. + using Segment = std::pair; + + //! Constructor. + //! + //! Construct an 'empty' `Supertable`. The data members will be given the + //! right sizes, but for the most part they will not be populated. That is left + //! to derived class constructors or callers. + //! + //!\param nelements Total number of subtable entries. + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). This field will be written to. 
+ Supertable(const std::size_t nelements, SubtableSizes &nbytes_subtables) + : nsubtables((nelements + SUBTABLE_MAX_SIZE - 1) / SUBTABLE_MAX_SIZE), + subtables(nsubtables), segments(nsubtables), + nbytes_subtables(nbytes_subtables) { + nbytes_subtables.Resize(nsubtables, 0); + + for (std::size_t i = 0; i + 1 < nsubtables; ++i) { + segments.at(i).second = SUBTABLE_MAX_SIZE; + } + if (nsubtables) { + // If `nelements` is an exact multiple of `SUBTABLE_MAX_SIZE` and not + // zero, we need this last size to be `SUBTABLE_MAX_SIZE`, not `0`. If + // `nelements` is zero, we won't be executing this statement. + segments.back().second = nelements % SUBTABLE_MAX_SIZE + ? nelements % SUBTABLE_MAX_SIZE + : SUBTABLE_MAX_SIZE; + } + } + + //! Constructor. + //! + //! Construct a `Supertable` from a collection of parsed messages. This + //! constructor leaves `segments` uninitialized. This is because `Supertable` + //! doesn't know which field of `Message` is the subtable. + //! + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). + //!\param window Window into buffer containing messages to be parsed. + Supertable(SubtableSizes &nbytes_subtables, BufferWindow &window) + : nsubtables(nbytes_subtables.size()), subtables(nsubtables), + segments(nsubtables), nbytes_subtables(nbytes_subtables) { + for (std::size_t i = 0; i < nsubtables; ++i) { + subtables.at(i) = read_message(window, nbytes_subtables.Get(i)); + } + } + + //! Calculate and store the sizes in bytes of the subtables. + //! + //! This function should be called once the subtables are populated. + void calculate_nbytes_subtables() { + for (std::size_t i = 0; i < nsubtables; ++i) { + nbytes_subtables.Set(i, subtables.at(i).ByteSize()); + } + } + + //! Calculate the total size in bytes of the subtables. + //! + //! This function assumes no changes have been made to the subtables since the + //! last call to `calculate_nbytes_subtables`. 
+ std::size_t ByteSize() const { + return std::accumulate(nbytes_subtables.begin(), nbytes_subtables.end(), + static_cast(0)); + } + + void SerializeToArray(void *const p, const std::size_t n) const { + unsigned char *const p_ = reinterpret_cast(p); + std::size_t total = 0; + for (std::size_t i = 0; i < nsubtables; ++i) { + const Message &subtable = subtables.at(i); + const google::protobuf::uint64 nbytes_subtable = nbytes_subtables.Get(i); + + subtable.SerializeToArray(p_ + total, nbytes_subtable); + total += nbytes_subtable; + } + if (total != n) { + throw std::invalid_argument("serialization buffer size incorrect"); + } + } + + //! Number of subtables. + std::size_t nsubtables; + + //! Subtables. + //! + //! It might be better to name this member 'messages.' Elsewhere we use + //! 'subtable' to refer to the fields of the messages containing the + //! supertable elements. Using that vocabulary, a `pb::FrequencySubtable` + //! would be a message while its `frequencies` field would be the subtable. + std::vector subtables; + + //! Segments for a concatenated subtable chain. + //! + //! A `Chain::iterator>` can be constructed from this. + std::vector segments; + + //! Sizes in bytes of the subtables. + SubtableSizes &nbytes_subtables; +}; + +//! A logical frequency table split into one or more subtables of moderate size. +struct FrequencySupertable + : Supertable { + //! Constructor. + //! + //! Construct and populate a `FrequencySupertable` from a vector of symbol + //! frequencies. + //! + //!\param frequencies Symbol frequencies to store in the subtables. + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). This field will be written to. 
+ FrequencySupertable(const std::vector &frequencies, + SubtableSizes &nbytes_subtables) + : Supertable(std::count_if(frequencies.begin(), frequencies.end(), + [](const std::size_t frequency) -> bool { + return frequency; + }), + nbytes_subtables) { + // `i` is the index of the subtable we're inserting into. (Technically + // we're inserting into the subtable's frequency map field rather than + // the subtable itself.) `j` is the number of entries we've inserted + // into subtable `i`. `k` is the index in the vector of frequencies + // passed to the constructor. + std::size_t k = 0; + for (std::size_t i = 0; i < nsubtables; ++i) { + Frequencies &frequencies_ = *subtables.at(i).mutable_frequencies(); + Segment &segment = segments.at(i); + // How big `frequencies_` should be when we're done. + const std::size_t nfrequencies_ = segment.second; + for (std::size_t j = 0; j < nfrequencies_; ++k) { + const std::size_t frequency = frequencies.at(k); + if (frequency) { + frequencies_.insert({k, frequency}); + ++j; + } + } + segment.first = frequencies_.begin(); + } + + calculate_nbytes_subtables(); + } + + //! Constructor. + //! + //! Construct a `FrequencySupertable` from a collection of parsed messages. + //! + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). + //!\param window Window into buffer containing messages to be parsed. + FrequencySupertable(SubtableSizes &nbytes_subtables, BufferWindow &window) + : Supertable(nbytes_subtables, window) { + for (std::size_t i = 0; i < nsubtables; ++i) { + Segment &segment = segments.at(i); + Frequencies &frequencies = *subtables.at(i).mutable_frequencies(); + + segment.first = frequencies.begin(); + segment.second = frequencies.size(); + } + } +}; + +//! A logical 'missed' table split into one or more subtables of moderate size. +struct MissedSupertable : Supertable { + //! Constructor. + //! + //! Construct an 'empty' `MissedSupertable`. It is expected that the caller + //! 
will subsequently write to the subtables using `Chain`. + //! + //!\param nmissed Number of missed symbols. + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). This field will be written to. + MissedSupertable(const std::size_t nmissed, SubtableSizes &nbytes_subtables) + : Supertable(nmissed, nbytes_subtables) { + for (std::size_t i = 0; i < nsubtables; ++i) { + Missed &missed = *subtables.at(i).mutable_missed(); + Segment &segment = segments.at(i); + // How big `missed` should be when we're done. + const std::size_t nmissed = segment.second; + + missed.Resize(nmissed, 0); + segment.first = missed.begin(); + } + } + + //! Constructor. + //! + //! Construct a `MissedSupertable` from a collection of parsed messages. + //! + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). + //!\param window Window into buffer containing messages to be parsed. + MissedSupertable(SubtableSizes &nbytes_subtables, BufferWindow &window) + : Supertable(nbytes_subtables, window) { + for (std::size_t i = 0; i < nsubtables; ++i) { + Segment &segment = segments.at(i); + Missed &missed = *subtables.at(i).mutable_missed(); + + segment.first = missed.begin(); + segment.second = missed.size(); + } + } +}; + +} // namespace + template MemoryBuffer huffman_encode(Symbol const *const begin, const std::size_t n) { @@ -188,7 +404,7 @@ MemoryBuffer huffman_encode(Symbol const *const begin, const std::size_t nbits = std::inner_product(code.frequencies.begin(), code.frequencies.end(), lengths.begin(), static_cast(0)); - const std::size_t nbytes = (nbits + CHAR_BIT - 1) / CHAR_BIT; + const std::size_t nbytes_hit = (nbits + CHAR_BIT - 1) / CHAR_BIT; pb::HuffmanHeader header; header.set_index_mapping(pb::HuffmanHeader::INCLUSIVE_RANGE); @@ -200,23 +416,18 @@ MemoryBuffer huffman_encode(Symbol const *const begin, header.add_endpoints(code.endpoints.second); header.set_nbits(nbits); - Frequencies &frequencies = 
*header.mutable_frequencies(); - { - std::size_t i = 0; - for (const std::size_t frequency : code.frequencies) { - if (frequency) { - frequencies.insert({i, frequency}); - } - ++i; - } - } + FrequencySupertable frequency_supertable( + code.frequencies, *header.mutable_nbytes_frequency_subtables()); + MissedSupertable missed_supertable(code.nmissed(), + *header.mutable_nbytes_missed_subtables()); - Missed &missed_ = *header.mutable_missed(); - missed_.Resize(code.nmissed(), 0); - Missed::iterator missed = missed_.begin(); + Chain chained_missed_supertable(missed_supertable.segments); + Chain::iterator missed = chained_missed_supertable.begin(); + // Now we're ready to populate the 'missed' subtables in the course of + // populating the 'hit' buffer. // Zero-initialize the bytes. - unsigned char *const hit_ = new unsigned char[nbytes](); + unsigned char *const hit_ = new unsigned char[nbytes_hit](); unsigned char *hit = hit_; unsigned char offset = 0; @@ -249,8 +460,18 @@ MemoryBuffer huffman_encode(Symbol const *const begin, } } + // We're done writing to the 'missed' subtables, so we can now calculate their + // serialized sizes. We need to do this before calling + // `missed_supertable.ByteSize`. 
+ missed_supertable.calculate_nbytes_subtables(); + const std::uint_least64_t nheader = header.ByteSize(); - MemoryBuffer out(HEADER_SIZE_SIZE + nheader + nbytes); + const std::size_t nbytes_frequency_supertable = + frequency_supertable.ByteSize(); + const std::size_t nbytes_missed_supertable = missed_supertable.ByteSize(); + MemoryBuffer out(HEADER_SIZE_SIZE + nheader + + nbytes_frequency_supertable + + nbytes_missed_supertable + nbytes_hit); { unsigned char *p = out.data.get(); const std::array nheader_ = @@ -261,8 +482,14 @@ MemoryBuffer huffman_encode(Symbol const *const begin, header.SerializeToArray(p, nheader); p += nheader; - std::copy(hit_, hit_ + nbytes, p); - p += nbytes; + frequency_supertable.SerializeToArray(p, nbytes_frequency_supertable); + p += nbytes_frequency_supertable; + + missed_supertable.SerializeToArray(p, nbytes_missed_supertable); + p += nbytes_missed_supertable; + + std::copy(hit_, hit_ + nbytes_hit, p); + p += nbytes_hit; } delete[] hit_; @@ -283,19 +510,24 @@ MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { if (endpoints_.size() != 2) { throw std::runtime_error("received an unexpected number of endpoints"); } - const std::pair endpoints(endpoints_.Get(0), - endpoints_.Get(1)); + const std::pair endpoints(endpoints_.Get(0), + endpoints_.Get(1)); if (header.codeword_mapping() != pb::HuffmanHeader::INDEX_FREQUENCY_PAIRS) { throw std::runtime_error("unrecognized Huffman codeword mapping"); } - const Frequencies &frequencies_ = header.frequencies(); + FrequencySupertable frequency_supertable( + *header.mutable_nbytes_frequency_subtables(), window); + Chain chained_frequency_supertable( + frequency_supertable.segments); if (header.missed_encoding() != pb::HuffmanHeader::LITERAL) { throw std::runtime_error("unrecognized Huffman missed buffer encoding"); } - const Missed &missed_ = header.missed(); - Missed::const_iterator missed = missed_.cbegin(); + MissedSupertable missed_supertable(*header.mutable_nbytes_missed_subtables(), + 
window); + Chain chained_missed_supertable(missed_supertable.segments); + Chain::iterator missed = chained_missed_supertable.begin(); if (header.hit_encoding() != pb::HuffmanHeader::RUN_TOGETHER) { throw std::runtime_error("unrecognized Huffman hit buffer encoding"); @@ -308,8 +540,9 @@ MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { "number of bytes in hit buffer"); } - const HuffmanCode code(endpoints, frequencies_.begin(), - frequencies_.end()); + const HuffmanCode code(endpoints, + chained_frequency_supertable.begin(), + chained_frequency_supertable.end()); // TODO: Maybe add a member function for this. const std::size_t nout = std::accumulate(code.frequencies.begin(), code.frequencies.end(), @@ -332,7 +565,7 @@ MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { *q++ = decoded.first ? decoded.second : *missed++; } assert(nbits_read == nbits); - assert(missed == missed_.cend()); + assert(missed == chained_missed_supertable.end()); return out; } diff --git a/src/mgard.proto b/src/mgard.proto index 2b80d7de0a..a8353f1dec 100644 --- a/src/mgard.proto +++ b/src/mgard.proto @@ -189,12 +189,24 @@ message HuffmanHeader { // Minimum and maximum symbols eligible for codewords. repeated sint64 endpoints = 5; + // Sizes in bytes of serialized `FrequencySubtable`s to follow. + repeated uint64 nbytes_frequency_subtables = 6; + // Sizes in bytes of serialized `MissedSubtable`s to follow. + repeated uint64 nbytes_missed_subtables = 7; + // Size in bits of the hit buffer to follow. + uint64 nbits = 8; +} + +// One or more of these will follow a `HuffmanHeader`. +message FrequencySubtable { // Index–frequency pairs for frequency table. map frequencies = 6; +} + +// One or more of these will follow the `FrequencySubtable`s after a `HuffmanHeader`. +message MissedSubtable { // Encountered symbols that were not assigned codewords. repeated sint64 missed = 7; - // Size of the hit buffer in bits. 
- uint64 nbits = 8; } message Device { From f887c7d61e7910accef8311067c0fc10c493bb7b Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 30 Jun 2022 11:15:17 -0400 Subject: [PATCH 54/58] Add comments motivating `Supertable`, `Chain` use. --- include/huffman.tpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/huffman.tpp b/include/huffman.tpp index cf5706e9b1..64e506051b 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -237,6 +237,8 @@ template struct Supertable { //! Calculate and store the sizes in bytes of the subtables. //! //! This function should be called once the subtables are populated. + //! `nbytes_subtables` (a field in some `pb::HuffmanHeader`) will be modified. + //! Subsequent changes to the subtables will invalidate the sizes. void calculate_nbytes_subtables() { for (std::size_t i = 0; i < nsubtables; ++i) { nbytes_subtables.Set(i, subtables.at(i).ByteSize()); @@ -252,6 +254,10 @@ template struct Supertable { static_cast(0)); } + //! Write the subtables out to a buffer. + //! + //!\param p Buffer to which to serialize the subtables. + //!\param n Expected number of bytes that will be written. void SerializeToArray(void *const p, const std::size_t n) const { unsigned char *const p_ = reinterpret_cast(p); std::size_t total = 0; @@ -416,11 +422,28 @@ MemoryBuffer huffman_encode(Symbol const *const begin, header.add_endpoints(code.endpoints.second); header.set_nbits(nbits); + // Originally, `pb::HuffmanHeader` had a field each for the frequency and + // 'missed' tables. Unfortunately, these tables can get very big. In + // particular, if the error tolerance is very low, the quantized coefficients + // will be very large, and many of them will be missed. This could result in + // the size of the 'missed' table exceeding the (default) limit imposed by + // `google::protobuf::CodedInputStream`. See . 
As a workaround, we are splitting the + // 'missed' table (and, for good measure, the frequency table, too) into a + // sequence of subtables of moderate size. + + // This `FrequencySupertable` creates and populates the frequency subtables. FrequencySupertable frequency_supertable( code.frequencies, *header.mutable_nbytes_frequency_subtables()); + // This `MissedSupertable` creates but does not populate the 'missed' + // subtables. We'll populate the subtables below, as we encode the stream. MissedSupertable missed_supertable(code.nmissed(), *header.mutable_nbytes_missed_subtables()); + // This `Chain` lets us treat the 'missed' subtables as a single logical + // table. It frees us from manually keeping track of when we need to advance + // from one subtable to the next. Chain chained_missed_supertable(missed_supertable.segments); Chain::iterator missed = chained_missed_supertable.begin(); // Now we're ready to populate the 'missed' subtables in the course of @@ -516,6 +539,8 @@ MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { if (header.codeword_mapping() != pb::HuffmanHeader::INDEX_FREQUENCY_PAIRS) { throw std::runtime_error("unrecognized Huffman codeword mapping"); } + // See the comments in `huffman_encode` for an explanation of why we use these + // `Supertable`s and `Chain`s. FrequencySupertable frequency_supertable( *header.mutable_nbytes_frequency_subtables(), window); Chain chained_frequency_supertable( From 6e281a6646a19e87cdd4382c773cda3d8b14edbf Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Thu, 30 Jun 2022 11:45:33 -0400 Subject: [PATCH 55/58] Add member functions for common size computations. 
--- include/huffman.hpp | 12 +++++++++--- include/huffman.tpp | 30 +++++++++++++++++------------- src/huffman.cpp | 8 +------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/include/huffman.hpp b/include/huffman.hpp index 7233f6e6a7..a943256d45 100644 --- a/include/huffman.hpp +++ b/include/huffman.hpp @@ -166,11 +166,11 @@ template class HuffmanCode { HuffmanCode(const std::pair &endpoints, const It begin, const It end); - //! Smallest and largest symbols (inclusive) to receive codewords. + //! Smallest and largest symbols (inclusive) eligible for codewords. std::pair endpoints; - //! Number of symbols that will be assigned codewords (including one for the - //! 'missed' symbol). + //! Number of symbols eligible for codewords (including one for the 'missed' + //! symbol). std::size_t ncodewords; //! Frequencies of the symbols in the input stream. @@ -179,10 +179,16 @@ template class HuffmanCode { //! Codewords associated to the symbols. std::vector codewords; + //! Report the number of symbols in the stream. + std::size_t nsymbols() const; + //! Report the number of out-of-range symbols encountered in the stream or //! given in the frequency table pairs. std::size_t nmissed() const; + //! Report the size in bits of the encoded stream. + std::size_t nbits_hit() const; + //! Check whether a symbol is eligible for a codeword. 
bool out_of_range(const Symbol symbol) const; diff --git a/include/huffman.tpp b/include/huffman.tpp index 64e506051b..29239ac0fc 100644 --- a/include/huffman.tpp +++ b/include/huffman.tpp @@ -149,10 +149,23 @@ HuffmanCode::HuffmanCode(const std::pair &endpoints, recursively_set_codewords(queue.top(), {}); } +template std::size_t HuffmanCode::nsymbols() const { + return std::accumulate(frequencies.begin(), frequencies.end(), + static_cast(0)); +} + template std::size_t HuffmanCode::nmissed() const { return frequencies.at(0); } +template std::size_t HuffmanCode::nbits_hit() const { + std::size_t nbits = 0; + for (std::size_t i = 0; i < ncodewords; ++i) { + nbits += frequencies.at(i) * codewords.at(i).length; + } + return nbits; +} + template bool HuffmanCode::out_of_range(const Symbol symbol) const { return symbol < endpoints.first or symbol > endpoints.second; @@ -403,13 +416,7 @@ MemoryBuffer huffman_encode(Symbol const *const begin, const std::size_t n) { const HuffmanCode code(begin, begin + n); - std::vector lengths; - for (const HuffmanCodeword &codeword : code.codewords) { - lengths.push_back(codeword.length); - } - const std::size_t nbits = - std::inner_product(code.frequencies.begin(), code.frequencies.end(), - lengths.begin(), static_cast(0)); + const std::size_t nbits = code.nbits_hit(); const std::size_t nbytes_hit = (nbits + CHAR_BIT - 1) / CHAR_BIT; pb::HuffmanHeader header; @@ -568,11 +575,8 @@ MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { const HuffmanCode code(endpoints, chained_frequency_supertable.begin(), chained_frequency_supertable.end()); - // TODO: Maybe add a member function for this. 
- const std::size_t nout = - std::accumulate(code.frequencies.begin(), code.frequencies.end(), - static_cast(0)); - MemoryBuffer out(nout); + const std::size_t nsymbols = code.nsymbols(); + MemoryBuffer out(nsymbols); Symbol *q = out.data.get(); const Bits bits(window.current, window.current + nbits / CHAR_BIT, @@ -581,7 +585,7 @@ MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { const typename HuffmanCode::Node root = code.queue.top(); assert(root); Bits::iterator b = bits.begin(); - for (std::size_t i = 0; i < nout; ++i) { + for (std::size_t i = 0; i < nsymbols; ++i) { typename HuffmanCode::Node node; for (node = root; node->left; node = *b++ ? node->right : node->left, ++nbits_read) diff --git a/src/huffman.cpp b/src/huffman.cpp index a9f9fbca1e..2f771c3969 100644 --- a/src/huffman.cpp +++ b/src/huffman.cpp @@ -248,13 +248,7 @@ HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, const HuffmanCode code(nql_endpoints, quantized_data, quantized_data + n); - std::vector lengths; - for (const HuffmanCodeword &codeword : code.codewords) { - lengths.push_back(codeword.length); - } - const std::size_t nbits = - std::inner_product(code.frequencies.begin(), code.frequencies.end(), - lengths.begin(), static_cast(0)); + const std::size_t nbits = code.nbits_hit(); const std::size_t nnz = code.ncodewords - std::count(code.frequencies.begin(), code.frequencies.end(), 0); From aeb89a97641f9c3580ddea882c4bb47dd2f892fc Mon Sep 17 00:00:00 2001 From: Jieyang Chen Date: Thu, 14 Jul 2022 16:33:56 -0400 Subject: [PATCH 56/58] Fix CPU lossless in MGARD-X --- .../mgard-x/CompressionHighLevel/Metadata.hpp | 20 +- include/mgard-x/Lossless/CPU.hpp | 215 ++---------------- include/utilities.tpp | 4 + 3 files changed, 47 insertions(+), 192 deletions(-) diff --git a/include/mgard-x/CompressionHighLevel/Metadata.hpp b/include/mgard-x/CompressionHighLevel/Metadata.hpp index a0111629ce..38519d6f9e 100644 --- a/include/mgard-x/CompressionHighLevel/Metadata.hpp +++ 
b/include/mgard-x/CompressionHighLevel/Metadata.hpp @@ -469,7 +469,15 @@ template struct Metadata { mgard::pb::Quantization &quantization = *header.mutable_quantization(); quantization.set_method(mgard::pb::Quantization::COEFFICIENTWISE_LINEAR); quantization.set_bin_widths(mgard::pb::Quantization::PER_COEFFICIENT); - quantization.set_type(mgard::pb::Quantization::INT64_T); + if (std::is_same::value) { + quantization.set_type(mgard::pb::Quantization::INT8_T); + } else if (std::is_same::value) { + quantization.set_type(mgard::pb::Quantization::INT16_T); + } else if (std::is_same::value) { + quantization.set_type(mgard::pb::Quantization::INT32_T); + } else if (std::is_same::value) { + quantization.set_type(mgard::pb::Quantization::INT64_T); + } quantization.set_big_endian(big_endian()); if (big_endian()) { etype = endiness_type::Big_Endian; @@ -710,7 +718,15 @@ template struct Metadata { mgard::pb::Quantization::COEFFICIENTWISE_LINEAR); assert(quantization.bin_widths() == mgard::pb::Quantization::PER_COEFFICIENT); - assert(quantization.type() == mgard::pb::Quantization::INT64_T); + if (std::is_same::value) { + assert(quantization.type() == mgard::pb::Quantization::INT8_T); + } else if (std::is_same::value) { + assert(quantization.type() == mgard::pb::Quantization::INT16_T); + } else if (std::is_same::value) { + assert(quantization.type() == mgard::pb::Quantization::INT32_T); + } else if (std::is_same::value) { + assert(quantization.type() == mgard::pb::Quantization::INT64_T); + } assert(quantization.big_endian() == big_endian()); if (big_endian()) { etype = endiness_type::Big_Endian; diff --git a/include/mgard-x/Lossless/CPU.hpp b/include/mgard-x/Lossless/CPU.hpp index 9171f87678..2583e155db 100644 --- a/include/mgard-x/Lossless/CPU.hpp +++ b/include/mgard-x/Lossless/CPU.hpp @@ -1,181 +1,33 @@ #ifndef MGARD_X_CPU_LOSSLESS_TEMPLATE_HPP #define MGARD_X_CPU_LOSSLESS_TEMPLATE_HPP -#include - -/*! CHECK - * Check that the condition holds. 
If it doesn't print a message and die. - */ -#define CHECK(cond, ...) \ - do { \ - if (!(cond)) { \ - fprintf(stderr, "%s:%d CHECK(%s) failed: ", __FILE__, __LINE__, #cond); \ - fprintf(stderr, "" __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - exit(1); \ - } \ - } while (0) - -/*! CHECK_ZSTD - * Check the zstd error code and die if an error occurred after printing a - * message. - */ -/*! CHECK_ZSTD - * Check the zstd error code and die if an error occurred after printing a - * message. - */ -#define CHECK_ZSTD(fn, ...) \ - do { \ - size_t const err = (fn); \ - CHECK(!ZSTD_isError(err), "%s", ZSTD_getErrorName(err)); \ - } while (0) - -namespace mgard { -void huffman_encoding(long int *quantized_data, const std::size_t n, - unsigned char **out_data_hit, size_t *out_data_hit_size, - unsigned char **out_data_miss, size_t *out_data_miss_size, - unsigned char **out_tree, size_t *out_tree_size); -void huffman_decoding(long int *quantized_data, - const std::size_t quantized_data_size, - unsigned char *out_data_hit, size_t out_data_hit_size, - unsigned char *out_data_miss, size_t out_data_miss_size, - unsigned char *out_tree, size_t out_tree_size); -} // namespace mgard +#include "proto/mgard.pb.h" +#include namespace mgard_x { -template -unsigned char *compress_memory_huffman(long int *const src, - const std::size_t srcLen, - std::size_t &outsize) { - unsigned char *out_data_hit = 0; - size_t out_data_hit_size; - unsigned char *out_data_miss = 0; - size_t out_data_miss_size; - unsigned char *out_tree = 0; - size_t out_tree_size; - ::mgard::huffman_encoding(src, srcLen, &out_data_hit, &out_data_hit_size, - &out_data_miss, &out_data_miss_size, &out_tree, - &out_tree_size); - - const size_t total_size = - out_data_hit_size / 8 + 4 + out_data_miss_size + out_tree_size; - unsigned char *payload = (unsigned char *)malloc(total_size); - unsigned char *bufp = payload; - - if (out_tree_size) { - std::memcpy(bufp, out_tree, out_tree_size); - bufp += out_tree_size; +template 
mgard::pb::Header setup_header() { + mgard::pb::Header header; + mgard::pb::Quantization &q = *header.mutable_quantization(); + if (std::is_same::value) { + q.set_type(mgard::pb::Quantization::INT8_T); + } else if (std::is_same::value) { + q.set_type(mgard::pb::Quantization::INT16_T); + } else if (std::is_same::value) { + q.set_type(mgard::pb::Quantization::INT32_T); + } else if (std::is_same::value) { + q.set_type(mgard::pb::Quantization::INT64_T); } - - std::memcpy(bufp, out_data_hit, out_data_hit_size / 8 + 4); - bufp += out_data_hit_size / 8 + 4; - - if (out_data_miss_size) { - std::memcpy(bufp, out_data_miss, out_data_miss_size); - bufp += out_data_miss_size; - } - - free(out_tree); - free(out_data_hit); - free(out_data_miss); - - const size_t cBuffSize = ZSTD_compressBound(total_size); - unsigned char *const zstd_buffer = new unsigned char[cBuffSize]; - - const std::size_t cSize = - ZSTD_compress(zstd_buffer, cBuffSize, payload, total_size, 1); - CHECK_ZSTD(cSize); - - free(payload); - payload = 0; - - const std::size_t bufferLen = 3 * sizeof(size_t) + cSize; - unsigned char *const buffer = new unsigned char[bufferLen]; - outsize = bufferLen; - - bufp = buffer; - *(size_t *)bufp = out_tree_size; - bufp += sizeof(size_t); - - *(size_t *)bufp = out_data_hit_size; - bufp += sizeof(size_t); - - *(size_t *)bufp = out_data_miss_size; - bufp += sizeof(size_t); - - { - unsigned char const *const p = zstd_buffer; - std::copy(p, p + cSize, bufp); - } - - { - unsigned char *buf = buffer; - out_tree_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_hit_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_miss_size = *(size_t *)buf; - buf += sizeof(size_t); - } - - return buffer; -} - -template -void decompress_memory_huffman(unsigned char *const src, - const std::size_t srcLen, long int *const dst, - const std::size_t dstLen) { - unsigned char *out_data_hit = 0; - size_t out_data_hit_size; - unsigned char *out_data_miss = 0; - size_t 
out_data_miss_size; - unsigned char *out_tree = 0; - size_t out_tree_size; - - unsigned char *buf = src; - - out_tree_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_hit_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_miss_size = *(size_t *)buf; - buf += sizeof(size_t); - - size_t total_huffman_size = - out_tree_size + out_data_hit_size / 8 + 4 + out_data_miss_size; - unsigned char *huffman_encoding_p = - (unsigned char *)malloc(total_huffman_size); - - size_t const dSize = ZSTD_decompress(huffman_encoding_p, total_huffman_size, - buf, srcLen - 3 * sizeof(size_t)); - CHECK_ZSTD(dSize); - - /* When zstd knows the content size, it will error if it doesn't match. */ - CHECK(total_huffman_size == dSize, - "Impossible because zstd will check this condition!"); - - out_tree = huffman_encoding_p; - out_data_hit = huffman_encoding_p + out_tree_size; - out_data_miss = - huffman_encoding_p + out_tree_size + out_data_hit_size / 8 + 4; - - mgard::huffman_decoding(dst, dstLen, out_data_hit, out_data_hit_size, - out_data_miss, out_data_miss_size, out_tree, - out_tree_size); - - free(huffman_encoding_p); + mgard::pb::Encoding &e = *header.mutable_encoding(); + // MGARD-X requires Zstd, so we always use CPU_HUFFMAN_ZSTD + e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); + e.set_serialization(mgard::pb::Encoding::RFMH); + return header; } template Array<1, Byte, DeviceType> CPUCompress(SubArray<1, C, DeviceType> &input_data) { - // PrintSubarray("CPUCompress input", input_data); - size_t input_count = input_data.getShape(0); C *in_data = NULL; @@ -183,14 +35,10 @@ Array<1, Byte, DeviceType> CPUCompress(SubArray<1, C, DeviceType> &input_data) { MemoryManager::Copy1D(in_data, input_data.data(), input_count, 0); DeviceRuntime::SyncQueue(0); - std::vector qv(input_count); - for (size_t i = 0; i < input_count; i++) { - qv[i] = (long int)in_data[i]; - } - - std::size_t actual_out_size; - unsigned char *lossless_data = compress_memory_huffman( - 
qv.data(), qv.size(), actual_out_size); + mgard::MemoryBuffer lossless_data_buffer = + mgard::compress(setup_header(), in_data, input_count * sizeof(C)); + unsigned char *lossless_data = lossless_data_buffer.data.get(); + std::size_t actual_out_size = lossless_data_buffer.size; uint8_t *out_data = NULL; MemoryManager::MallocHost(out_data, @@ -205,10 +53,6 @@ Array<1, Byte, DeviceType> CPUCompress(SubArray<1, C, DeviceType> &input_data) { MemoryManager::FreeHost(out_data); MemoryManager::FreeHost(in_data); - delete[] lossless_data; - - // PrintSubarray("CPUCompress output", SubArray(output_data)); - return output_data; } @@ -216,7 +60,6 @@ template Array<1, C, DeviceType> CPUDecompress(SubArray<1, Byte, DeviceType> &input_data) { - // PrintSubarray("CPUDecompress input", input_data); size_t input_count = input_data.getShape(0); Byte *in_data = NULL; MemoryManager::MallocHost(in_data, input_count, 0); @@ -225,19 +68,13 @@ CPUDecompress(SubArray<1, Byte, DeviceType> &input_data) { uint32_t actual_out_count = 0; actual_out_count = *reinterpret_cast(in_data); - // *oriData = (uint8_t*)malloc(outSize); C *out_data = NULL; MemoryManager::MallocHost(out_data, actual_out_count, 0); DeviceRuntime::SyncQueue(0); - long int *qv = new long int[actual_out_count]; - size_t out_size = actual_out_count * sizeof(long int); - decompress_memory_huffman( - in_data + sizeof(size_t), input_count - sizeof(size_t), qv, out_size); - - for (size_t i = 0; i < actual_out_count; i++) { - out_data[i] = (C)qv[i]; - } + mgard::decompress(setup_header(), in_data + sizeof(size_t), + input_count - sizeof(size_t), out_data, + actual_out_count * sizeof(C)); Array<1, C, DeviceType> output_data({(SIZE)actual_out_count}); output_data.load(out_data); @@ -245,8 +82,6 @@ CPUDecompress(SubArray<1, Byte, DeviceType> &input_data) { MemoryManager::FreeHost(out_data); MemoryManager::FreeHost(in_data); - // PrintSubarray("CPUDecompress output", SubArray(output_data)); - return output_data; } diff --git 
a/include/utilities.tpp b/include/utilities.tpp index 6260fa5af4..296e1f6f0c 100644 --- a/include/utilities.tpp +++ b/include/utilities.tpp @@ -201,6 +201,8 @@ bool operator!=(const RangeSlice &a, const RangeSlice &b) { return !operator==(a, b); } +#ifndef __NVCC__ + template CartesianProduct::CartesianProduct(const std::array factors) : factors(factors) { @@ -325,6 +327,8 @@ typename CartesianProduct::iterator::reference return value; } +#endif + template void check_dimension_index_bounds(const std::size_t dimension) { if (dimension >= N) { From eed1eb17c0ea5803571377daf28635bf25aaa4b5 Mon Sep 17 00:00:00 2001 From: Jieyang Chen Date: Mon, 18 Jul 2022 10:44:01 -0400 Subject: [PATCH 57/58] =?UTF-8?q?Put=20CartesianProduct=20and=20CartesianP?= =?UTF-8?q?roduct::iterator=20in=20#ifndef=20=5F=5FNVCC=5F=5F=20=E2=80=93?= =?UTF-8?q?=20#endif=20block?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/utilities.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/utilities.hpp b/include/utilities.hpp index 43b005a0d4..f687277eaa 100644 --- a/include/utilities.hpp +++ b/include/utilities.hpp @@ -293,6 +293,8 @@ bool operator==(const RangeSlice &a, const RangeSlice &b); template bool operator!=(const RangeSlice &a, const RangeSlice &b); +#ifndef __NVCC__ + //! Mimic Python's `itertools.product`. Allow iteration over the Cartesian //! product of a collection of ranges. //! @@ -411,6 +413,8 @@ template class CartesianProduct::iterator { std::array inner; }; +#endif + //! Check that a dimension index is in bounds. //! //!\param dimension Dimension index. From 4ef023d0affabc2023ab3c575af9c798055ac8d9 Mon Sep 17 00:00:00 2001 From: Ben Whitney Date: Tue, 19 Jul 2022 12:32:51 -0400 Subject: [PATCH 58/58] Add quantization type function template. 
--- include/format.hpp | 5 ++ .../mgard-x/CompressionHighLevel/Metadata.hpp | 53 +++++-------------- src/format.cpp | 16 ++++++ tests/src/test_format.cpp | 11 ++++ 4 files changed, 45 insertions(+), 40 deletions(-) diff --git a/include/format.hpp b/include/format.hpp index f3b166ffac..cbc7da334f 100644 --- a/include/format.hpp +++ b/include/format.hpp @@ -82,6 +82,11 @@ template bool big_endian(); //!\return `Dataset::Type` corresponding to `Real`. template pb::Dataset::Type type_to_dataset_type(); +//! Return the `Quantization::Type` value corresponding to an integral type. +//! +//!\return `Quantization::Type` corresponding to `Int`. +template pb::Quantization::Type type_to_quantization_type(); + //! Allocate a quantization buffer of the proper alignment and size. //! //!\param header Self-describing dataset header. diff --git a/include/mgard-x/CompressionHighLevel/Metadata.hpp b/include/mgard-x/CompressionHighLevel/Metadata.hpp index 38519d6f9e..3ba5bb1dd4 100644 --- a/include/mgard-x/CompressionHighLevel/Metadata.hpp +++ b/include/mgard-x/CompressionHighLevel/Metadata.hpp @@ -86,11 +86,8 @@ template struct Metadata { std::cout << "Metadata size: " << metadata_size << "\n"; std::cout << "Metadata crc32: " << metadata_crc32 << "\n"; std::cout << "Endiness: "; - if (etype == endiness_type::Big_Endian) { - std::cout << "Big Endian\n"; - } else { - std::cout << "Little Endian\n"; - } + std::cout << (etype == endiness_type::Big_Endian ? "Big Endian\n" + : "Little Endian\n"); std::cout << "Data type: "; if (dtype == data_type::Float) { std::cout << "Float\n"; @@ -173,11 +170,8 @@ template struct Metadata { private: SERIALIZED_TYPE *SerializeAll(uint32_t &total_size) { - if (big_endian()) { - etype = endiness_type::Big_Endian; - } else { - etype = endiness_type::Little_Endian; - } + etype = big_endian() ? 
endiness_type::Big_Endian + : endiness_type::Little_Endian; total_size = 0; // about MGARD software @@ -469,21 +463,10 @@ template struct Metadata { mgard::pb::Quantization &quantization = *header.mutable_quantization(); quantization.set_method(mgard::pb::Quantization::COEFFICIENTWISE_LINEAR); quantization.set_bin_widths(mgard::pb::Quantization::PER_COEFFICIENT); - if (std::is_same::value) { - quantization.set_type(mgard::pb::Quantization::INT8_T); - } else if (std::is_same::value) { - quantization.set_type(mgard::pb::Quantization::INT16_T); - } else if (std::is_same::value) { - quantization.set_type(mgard::pb::Quantization::INT32_T); - } else if (std::is_same::value) { - quantization.set_type(mgard::pb::Quantization::INT64_T); - } - quantization.set_big_endian(big_endian()); - if (big_endian()) { - etype = endiness_type::Big_Endian; - } else { - etype = endiness_type::Little_Endian; - } + quantization.set_type(mgard::type_to_quantization_type()); + quantization.set_big_endian(big_endian()); + etype = big_endian() ? 
endiness_type::Big_Endian + : endiness_type::Little_Endian; } { // Encoding @@ -718,21 +701,11 @@ template struct Metadata { mgard::pb::Quantization::COEFFICIENTWISE_LINEAR); assert(quantization.bin_widths() == mgard::pb::Quantization::PER_COEFFICIENT); - if (std::is_same::value) { - assert(quantization.type() == mgard::pb::Quantization::INT8_T); - } else if (std::is_same::value) { - assert(quantization.type() == mgard::pb::Quantization::INT16_T); - } else if (std::is_same::value) { - assert(quantization.type() == mgard::pb::Quantization::INT32_T); - } else if (std::is_same::value) { - assert(quantization.type() == mgard::pb::Quantization::INT64_T); - } - assert(quantization.big_endian() == big_endian()); - if (big_endian()) { - etype = endiness_type::Big_Endian; - } else { - etype = endiness_type::Little_Endian; - } + assert(quantization.type() == + mgard::type_to_quantization_type()); + assert(quantization.big_endian() == big_endian()); + etype = big_endian() ? endiness_type::Big_Endian + : endiness_type::Little_Endian; } { // Encoding diff --git a/src/format.cpp b/src/format.cpp index e9cda8e756..6056c8d262 100644 --- a/src/format.cpp +++ b/src/format.cpp @@ -82,6 +82,22 @@ template <> pb::Dataset::Type type_to_dataset_type() { return pb::Dataset::DOUBLE; } +template <> pb::Quantization::Type type_to_quantization_type() { + return pb::Quantization::INT8_T; +} + +template <> pb::Quantization::Type type_to_quantization_type() { + return pb::Quantization::INT16_T; +} + +template <> pb::Quantization::Type type_to_quantization_type() { + return pb::Quantization::INT32_T; +} + +template <> pb::Quantization::Type type_to_quantization_type() { + return pb::Quantization::INT64_T; +} + MemoryBuffer quantization_buffer(const pb::Header &header, const std::size_t ndof) { static_assert(CHAR_BIT == 8, "unexpected number of bits in a byte"); diff --git a/tests/src/test_format.cpp b/tests/src/test_format.cpp index 47055e6ac1..48c0751c87 100644 --- a/tests/src/test_format.cpp 
+++ b/tests/src/test_format.cpp @@ -180,6 +180,17 @@ TEST_CASE("dataset types", "[format]") { REQUIRE(mgard::type_to_dataset_type() == mgard::pb::Dataset::DOUBLE); } +TEST_CASE("quantization types", "[format]") { + REQUIRE(mgard::type_to_quantization_type() == + mgard::pb::Quantization::INT8_T); + REQUIRE(mgard::type_to_quantization_type() == + mgard::pb::Quantization::INT16_T); + REQUIRE(mgard::type_to_quantization_type() == + mgard::pb::Quantization::INT32_T); + REQUIRE(mgard::type_to_quantization_type() == + mgard::pb::Quantization::INT64_T); +} + namespace { void test_quantization_buffer(const mgard::pb::Quantization::Type type,