From 033940f97cfa5e708c46961de7e85d5e4976ee40 Mon Sep 17 00:00:00 2001 From: Evgenii Kliuchnikov Date: Thu, 11 Jan 2024 02:03:59 -0800 Subject: add comment (fingerprint) CLI feature PiperOrigin-RevId: 597489910 --- c/tools/brotli.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++---- tests/cli_test.sh | 84 +++++++++++++++++++++++ 2 files changed, 266 insertions(+), 12 deletions(-) create mode 100755 tests/cli_test.sh diff --git a/c/tools/brotli.c b/c/tools/brotli.c index 03f2d06..56c60af 100644 --- a/c/tools/brotli.c +++ b/c/tools/brotli.c @@ -103,9 +103,17 @@ typedef enum { COMMAND_VERSION } Command; +typedef enum { + COMMENT_INIT, + COMMENT_READ, + COMMENT_OK, + COMMENT_BAD, +} CommentState; + #define DEFAULT_LGWIN 24 #define DEFAULT_SUFFIX ".br" #define MAX_OPTIONS 20 +#define MAX_COMMENT_LEN 80 typedef struct { /* Parameters */ @@ -123,6 +131,10 @@ typedef struct { const char* output_path; const char* dictionary_path; const char* suffix; + uint8_t comment[MAX_COMMENT_LEN]; + size_t comment_len; + size_t comment_pos; + CommentState comment_state; int not_input_indices[MAX_OPTIONS]; size_t longest_path_len; size_t input_count; @@ -161,6 +173,55 @@ typedef struct { clock_t end_time; } Context; +/* Parse base 64 encoded string to buffer. Not performance-centric. + |out_len| as input is buffer size; |out_len| as output is decoded length. + Returns BROTLI_FALSE if either input is not (relaxed) base 64 conformant, + or output does not fit buffer. */ +static BROTLI_BOOL ParseBase64(const char* str, uint8_t* out, size_t* out_len) { + size_t in_len = strlen(str); + size_t max_out_len = *out_len; + size_t i; + size_t bit_count = 0; + uint32_t bits = 0; + size_t padding_count = 0; + size_t octet_count = 0; + for (i = 0; i < in_len; ++i) { + char c = str[i]; + uint32_t sextet = 0; + if (c == 9 || c == 10 || c == 13 || c == ' ') { + continue; + } + if (c == '=') { + padding_count++; + continue; + } + if (padding_count) return BROTLI_FALSE; + if (c == '+' || c == '-') { + sextet = 62; + } else if (c == '/' || c == '_') { + sextet = 63; + } else if (c >= 'A' && c <= 'Z') { + sextet = c - 'A'; + } else if (c >= 'a' && c <= 'z') { + sextet = c - 'a' + 26; + } else if (c >= '0' && c <= '9') { + sextet = c - '0' + 52; + } else { + return BROTLI_FALSE; + } + bits = (bits << 6) | sextet; + bit_count += 6; + if (bit_count >= 8) { + if (octet_count == max_out_len) return BROTLI_FALSE; + bit_count -= 8; + out[octet_count++] = (bits >> bit_count) & 0xFF; + } + } + if (padding_count > 2) return BROTLI_FALSE; + *out_len = octet_count; + return BROTLI_TRUE; +} + /* Parse up to 5 decimal digits. */ static BROTLI_BOOL ParseInt(const char* s, int low, int high, int* result) { int value = 0; @@ -217,6 +278,7 @@ static Command ParseParams(Context* params) { BROTLI_BOOL lgwin_set = BROTLI_FALSE; BROTLI_BOOL suffix_set = BROTLI_FALSE; BROTLI_BOOL after_dash_dash = BROTLI_FALSE; + BROTLI_BOOL comment_set = BROTLI_FALSE; Command command = ParseAlias(argv[0]); for (i = 1; i < argc; ++i) { @@ -344,8 +406,9 @@ static Command ParseParams(Context* params) { params->quality = 11; continue; } - /* o/q/w/D/S with parameter is expected */ - if (c != 'o' && c != 'q' && c != 'w' && c != 'D' && c != 'S') { + /* o/q/w/C/D/S with parameter is expected */ + if (c != 'o' && c != 'q' && c != 'w' && c != 'C' && c != 'D' && + c != 'S') { fprintf(stderr, "invalid argument -%c\n", c); return COMMAND_INVALID; } @@ -393,6 +456,17 @@ static Command ParseParams(Context* params) { params->lgwin, BROTLI_MIN_WINDOW_BITS); return COMMAND_INVALID; } + } else if (c == 'C') { + if (comment_set) { + fprintf(stderr, "comment already set\n"); + return COMMAND_INVALID; + } + params->comment_len = MAX_COMMENT_LEN; + if (!ParseBase64(argv[i], params->comment, ¶ms->comment_len)) { + fprintf(stderr, "invalid base64-encoded comment\n"); + return COMMAND_INVALID; + } + comment_set = BROTLI_TRUE; } else if (c == 'D') { if (params->dictionary_path) { fprintf(stderr, "dictionary path already set\n"); @@ -486,7 +560,7 @@ static Command ParseParams(Context* params) { return COMMAND_VERSION; } else { /* key=value */ - const char* value = strrchr(arg, '='); + const char* value = strchr(arg, '='); size_t key_len; if (!value || value[1] == 0) { fprintf(stderr, "must pass the parameter as --%s=value\n", arg); @@ -494,7 +568,18 @@ static Command ParseParams(Context* params) { } key_len = (size_t)(value - arg); value++; - if (strncmp("dictionary", arg, key_len) == 0) { + if (strncmp("comment", arg, key_len) == 0) { + if (comment_set) { + fprintf(stderr, "comment already set\n"); + return COMMAND_INVALID; + } + params->comment_len = MAX_COMMENT_LEN; + if (!ParseBase64(value, params->comment, ¶ms->comment_len)) { + fprintf(stderr, "invalid base64-encoded comment\n"); + return COMMAND_INVALID; + } + comment_set = BROTLI_TRUE; + } else if (strncmp("dictionary", arg, key_len) == 0) { if (params->dictionary_path) { fprintf(stderr, "dictionary path already set\n"); return COMMAND_INVALID; @@ -633,6 +718,12 @@ static void PrintHelp(const char* name, BROTLI_BOOL error) { " decodable with regular brotli decoders\n", BROTLI_MIN_WINDOW_BITS, BROTLI_LARGE_MAX_WINDOW_BITS); fprintf(media, +" -C B64, --comment=B64 set comment; argument is base64-decoded first;\n" +" (maximal decoded length: %d)\n" +" when decoding: check stream comment;\n" +" when encoding: embed comment (fingerprint)\n", + MAX_COMMENT_LEN); + fprintf(media, " -D FILE, --dictionary=FILE use FILE as raw (LZ77) dictionary\n"); fprintf(media, " -S SUF, --suffix=SUF output file suffix (default:'%s')\n", @@ -1034,10 +1125,59 @@ static const char* PrettyDecoderErrorString(BrotliDecoderErrorCode code) { return error_str; } +static void OnMetadataStart(void* opaque, size_t size) { + Context* context = (Context*) opaque; + if (context->comment_state == COMMENT_INIT) { + if (context->comment_len != size) { + context->comment_state = COMMENT_BAD; + return; + } + context->comment_pos = 0; + context->comment_state = COMMENT_READ; + } +} + +static void OnMetadataChunk(void* opaque, const uint8_t* data, size_t size) { + Context* context = (Context*) opaque; + if (context->comment_state == COMMENT_READ) { + size_t i; + for (i = 0; i < size; ++i) { + if (context->comment_pos >= context->comment_len) { + context->comment_state = COMMENT_BAD; + return; + } + if (context->comment[context->comment_pos++] != data[i]) { + context->comment_state = COMMENT_BAD; + return; + } + } + if (context->comment_pos == context->comment_len) { + context->comment_state = COMMENT_OK; + } + } +} + static BROTLI_BOOL DecompressFile(Context* context, BrotliDecoderState* s) { BrotliDecoderResult result = BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT; + if (context->comment_len) { + context->comment_state = COMMENT_INIT; + BrotliDecoderSetMetadataCallbacks(s, &OnMetadataStart, &OnMetadataChunk, + (void*)context); + } else { + context->comment_state = COMMENT_OK; + } + InitializeBuffers(context); for (;;) { + /* Early check */ + if (context->comment_state == COMMENT_BAD) { + fprintf(stderr, "corrupt input [%s]\n", + PrintablePath(context->current_input_path)); + if (context->verbosity > 0) { + fprintf(stderr, "reason: comment mismatch\n"); + } + return BROTLI_FALSE; + } if (result == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT) { if (!HasMoreInput(context)) { fprintf(stderr, "corrupt input [%s]\n", @@ -1068,6 +1208,14 @@ static BROTLI_BOOL DecompressFile(Context* context, BrotliDecoderState* s) { PrintFileProcessingProgress(context); fprintf(stderr, "\n"); } + /* Final check */ + if (context->comment_state != COMMENT_OK) { + fprintf(stderr, "corrupt input [%s]\n", + PrintablePath(context->current_input_path)); + if (context->verbosity > 0) { + fprintf(stderr, "reason: comment mismatch\n"); + } + } return BROTLI_TRUE; } else { /* result == BROTLI_DECODER_RESULT_ERROR */ fprintf(stderr, "corrupt input [%s]\n", @@ -1121,6 +1269,7 @@ static BROTLI_BOOL DecompressFiles(Context* context) { static BROTLI_BOOL CompressFile(Context* context, BrotliEncoderState* s) { BROTLI_BOOL is_eof = BROTLI_FALSE; + BROTLI_BOOL prologue = !!context->comment_len; InitializeBuffers(context); for (;;) { if (context->available_in == 0 && !is_eof) { @@ -1128,14 +1277,34 @@ static BROTLI_BOOL CompressFile(Context* context, BrotliEncoderState* s) { is_eof = !HasMoreInput(context); } - if (!BrotliEncoderCompressStream(s, - is_eof ? BROTLI_OPERATION_FINISH : BROTLI_OPERATION_PROCESS, - &context->available_in, &context->next_in, - &context->available_out, &context->next_out, NULL)) { - /* Should detect OOM? */ - fprintf(stderr, "failed to compress data [%s]\n", - PrintablePath(context->current_input_path)); - return BROTLI_FALSE; + if (prologue) { + prologue = BROTLI_FALSE; + const uint8_t* next_meta = context->comment; + size_t available_meta = context->comment_len; + if (!BrotliEncoderCompressStream(s, + BROTLI_OPERATION_EMIT_METADATA, + &available_meta, &next_meta, + &context->available_out, &context->next_out, NULL)) { + /* Should detect OOM? */ + fprintf(stderr, "failed to emit metadata [%s]\n", + PrintablePath(context->current_input_path)); + return BROTLI_FALSE; + } + if (available_meta != 0) { + fprintf(stderr, "failed to emit metadata [%s]\n", + PrintablePath(context->current_input_path)); + return BROTLI_FALSE; + } + } else { + if (!BrotliEncoderCompressStream(s, + is_eof ? BROTLI_OPERATION_FINISH : BROTLI_OPERATION_PROCESS, + &context->available_in, &context->next_in, + &context->available_out, &context->next_out, NULL)) { + /* Should detect OOM? */ + fprintf(stderr, "failed to compress data [%s]\n", + PrintablePath(context->current_input_path)); + return BROTLI_FALSE; + } } if (context->available_out == 0) { @@ -1230,6 +1399,7 @@ int main(int argc, char** argv) { context.quality = 11; context.lgwin = -1; context.verbosity = 0; + context.comment_len = 0; context.force_overwrite = BROTLI_FALSE; context.junk_source = BROTLI_FALSE; context.reject_uncompressible = BROTLI_FALSE; diff --git a/tests/cli_test.sh b/tests/cli_test.sh new file mode 100755 index 0000000..b6c563e --- /dev/null +++ b/tests/cli_test.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +source gbash.sh || exit +source module gbash_unit.sh + +readonly MKTEMP='/bin/mktemp' +readonly RM='/bin/rm' + +function test::brotli_cli::setup() { + TEMP_DIR=$(${MKTEMP} -d) + LOG INFO ${TEMP_DIR} + BROTLI_PKG="${RUNFILES}/google3/third_party/brotli" + BROTLI="${BROTLI_PKG}/tools/brotli" + cd ${TEMP_DIR} + echo "Kot lomom kolol slona" > text.orig +} + +function test::brotli_cli::teardown() { + unset BROTLI + unset BROTLI_PKG + ${RM} -rf ${TEMP_DIR} + unset TEMP_DIR +} + +function test::brotli_cli::roundtrip() { + ${BROTLI} -Zfk text.orig -o text.br + ${BROTLI} -d text.br -o text.unbr + EXPECT_FILE_CONTENT_EQ text.orig text.unbr +} + +# 'SGVsbG8=' == $(echo -n "Hello" | base64) + +function test::brotli_cli::comment_ok() { + ${BROTLI} -Zfk -C SGVsbG8= text.orig -o text.br + ${BROTLI} -d --comment=SGVsbG8= text.br -o text.unbr + EXPECT_FILE_CONTENT_EQ text.orig text.unbr +} + +function test::brotli_cli::comment_no_padding() { + ${BROTLI} -Zfk -C SGVsbG8 text.orig -o text.br + ${BROTLI} -d --comment=SGVsbG8= text.br -o text.unbr + EXPECT_FILE_CONTENT_EQ text.orig text.unbr +} + +function test::brotli_cli::comment_extra_padding() { + ${BROTLI} -Zfk -C SGVsbG8 text.orig -o text.br + ${BROTLI} -d --comment=SGVsbG8== text.br -o text.unbr + EXPECT_FILE_CONTENT_EQ text.orig text.unbr +} + +function test::brotli_cli::comment_ignored() { + ${BROTLI} -Zfk -C SGVsbG8= text.orig -o text.br + EXPECT_SUCCEED "${BROTLI} -d text.br -o text.unbr" +} + +function test::brotli_cli::comment_mismatch_content() { + ${BROTLI} -Zfk --comment=SGVsbG8= text.orig -o text.br + EXPECT_FAIL "${BROTLI} -dC SGVsbG7= text.br -o text.unbr" + EXPECT_FAIL "${BROTLI} -tC SGVsbG7= text.br" +} + +function test::brotli_cli::comment_mismatch_length() { + ${BROTLI} -Zfk --comment=SGVsbG8= text.orig -o text.br + EXPECT_FAIL "${BROTLI} -tC SGVsbA== text.br" +} + +function test::brotli_cli::comment_too_much_padding() { + EXPECT_FAIL "${BROTLI} -Zfk -C SGVsbG8=== text.orig -o text.br" +} + +function test::brotli_cli::comment_padding_in_the_middle() { + EXPECT_FAIL "${BROTLI} -Zfk -C SGVsbG=8 text.orig -o text.br" +} + +function test::brotli_cli::comment_ignore_tab_cr_lf_sp() { + COMMENT=$'S\tG\rV\ns bG8=' + EXPECT_SUCCEED "${BROTLI} -Zfk -C \"${COMMENT}\" text.orig -o text.br" +} + +function test::brotli_cli::comment_invalid_chars() { + EXPECT_FAIL "${BROTLI} -Zfk -C S.GVsbG8= text.orig -o text.br" +} + +gbash::unit::main "$@" -- cgit v1.1