From 3396c67fea14aef349905b90dfef0ff4ada1be8c Mon Sep 17 00:00:00 2001
From: Evgenii Kliuchnikov <eustas@google.com>
Date: Mon, 15 Jan 2024 12:49:21 -0800
Subject: add brcat alias + flag to decompress concatenated streams

PiperOrigin-RevId: 598652401
---
 c/tools/brotli.c  | 168 ++++++++++++++++++++++++++++++++++++++----------------
 c/tools/brotli.md |  10 +++-
 docs/brotli.1     |  12 +++-
 tests/cli_test.sh |  12 ++++
 4 files changed, 151 insertions(+), 51 deletions(-)

diff --git a/c/tools/brotli.c b/c/tools/brotli.c
index 56c60af..0dc99bd 100644
--- a/c/tools/brotli.c
+++ b/c/tools/brotli.c
@@ -112,7 +112,7 @@ typedef enum {
 
 #define DEFAULT_LGWIN 24
 #define DEFAULT_SUFFIX ".br"
-#define MAX_OPTIONS 20
+#define MAX_OPTIONS 24
 #define MAX_COMMENT_LEN 80
 
 typedef struct {
@@ -128,6 +128,7 @@ typedef struct {
   BROTLI_BOOL test_integrity;
   BROTLI_BOOL decompress;
   BROTLI_BOOL large_window;
+  BROTLI_BOOL allow_concatenated;
   const char* output_path;
   const char* dictionary_path;
   const char* suffix;
@@ -145,6 +146,7 @@ typedef struct {
   uint8_t* dictionary;
   size_t dictionary_size;
   BrotliEncoderPreparedDictionary* prepared_dictionary;
+  BrotliDecoderState* decoder;
   char* modified_path;  /* Storage for path with appended / cut suffix */
   int iterator;
   int ignore;
@@ -187,7 +189,7 @@ static BROTLI_BOOL ParseBase64(const char* str, uint8_t* out, size_t* out_len) {
   size_t octet_count = 0;
   for (i = 0; i < in_len; ++i) {
     char c = str[i];
-    uint32_t sextet = 0;
+    int sextet = 0;
     if (c == 9 || c == 10 || c == 13 || c == ' ') {
       continue;
     }
@@ -209,7 +211,7 @@ static BROTLI_BOOL ParseBase64(const char* str, uint8_t* out, size_t* out_len) {
     } else {
       return BROTLI_FALSE;
     }
-    bits = (bits << 6) | sextet;
+    bits = (bits << 6) | (uint32_t)sextet;
     bit_count += 6;
     if (bit_count >= 8) {
       if (octet_count == max_out_len) return BROTLI_FALSE;
@@ -250,17 +252,16 @@ static const char* FileName(const char* path) {
 }
 
 /* Detect if the program name is a special alias that infers a command type. */
-static Command ParseAlias(const char* name) {
+static BROTLI_BOOL CheckAlias(const char* name, const char* alias) {
   /* TODO: cast name to lower case? */
-  const char* unbrotli = "unbrotli";
-  size_t unbrotli_len = strlen(unbrotli);
+  size_t alias_len = strlen(alias);
   name = FileName(name);
   /* Partial comparison. On Windows there could be ".exe" suffix. */
-  if (strncmp(name, unbrotli, unbrotli_len) == 0) {
-    char terminator = name[unbrotli_len];
-    if (terminator == 0 || terminator == '.') return COMMAND_DECOMPRESS;
+  if (strncmp(name, alias, alias_len) == 0) {
+    char terminator = name[alias_len];
+    if (terminator == 0 || terminator == '.') return BROTLI_TRUE;
   }
-  return COMMAND_COMPRESS;
+  return BROTLI_FALSE;
 }
 
 static Command ParseParams(Context* params) {
@@ -279,7 +280,20 @@ static Command ParseParams(Context* params) {
   BROTLI_BOOL suffix_set = BROTLI_FALSE;
   BROTLI_BOOL after_dash_dash = BROTLI_FALSE;
   BROTLI_BOOL comment_set = BROTLI_FALSE;
-  Command command = ParseAlias(argv[0]);
+  BROTLI_BOOL concatenated_set = BROTLI_FALSE;
+  Command command = COMMAND_COMPRESS;
+
+  if (CheckAlias(argv[0], "brcat")) {
+    command_set = BROTLI_TRUE;
+    command = COMMAND_DECOMPRESS;
+    concatenated_set = BROTLI_TRUE;
+    params->allow_concatenated = BROTLI_TRUE;
+    output_set = BROTLI_TRUE;
+    params->write_to_stdout = BROTLI_TRUE;
+  } else if (CheckAlias(argv[0], "unbrotli")) {
+    command_set = BROTLI_TRUE;
+    command = COMMAND_DECOMPRESS;
+  }
 
   for (i = 1; i < argc; ++i) {
     const char* arg = argv[i];
@@ -293,7 +307,7 @@ static Command ParseParams(Context* params) {
     }
 
     /* Too many options. The expected longest option list is:
-       "-q 0 -w 10 -o f -D d -S b -d -f -k -n -v --", i.e. 16 items in total.
+       "-q 0 -w 10 -o f -D d -S b -d -f -k -n -v -K --", i.e. 17 items in total.
        This check is an additional guard that is never triggered, but provides
        a guard for future changes. */
     if (next_option_index > (MAX_OPTIONS - 2)) {
@@ -394,6 +408,14 @@ static Command ParseParams(Context* params) {
           }
           params->verbosity = 1;
           continue;
+        } else if (c == 'K') {
+          if (concatenated_set) {
+            fprintf(stderr, "argument -K / --concatenated already set\n");
+            return COMMAND_INVALID;
+          }
+          concatenated_set = BROTLI_TRUE;
+          params->allow_concatenated = BROTLI_TRUE;
+          continue;
         } else if (c == 'V') {
           /* Don't parse further. */
           return COMMAND_VERSION;
@@ -491,6 +513,14 @@ static Command ParseParams(Context* params) {
         }
         quality_set = BROTLI_TRUE;
         params->quality = 11;
+      } else if (strcmp("concatenated", arg) == 0) {
+        if (concatenated_set) {
+          fprintf(stderr, "argument -K / --concatenated already set\n");
+          return COMMAND_INVALID;
+        }
+        concatenated_set = BROTLI_TRUE;
+        params->allow_concatenated = BROTLI_TRUE;
+        continue;
       } else if (strcmp("decompress", arg) == 0) {
         if (command_set) {
           fprintf(stderr, "command already set when parsing --decompress\n");
@@ -669,6 +699,12 @@ static Command ParseParams(Context* params) {
   if (strchr(params->suffix, '/') || strchr(params->suffix, '\\')) {
     return COMMAND_INVALID;
   }
+  if (!params->decompress && params->allow_concatenated) {
+    return COMMAND_INVALID;
+  }
+  if (params->allow_concatenated && params->comment_len) {
+    return COMMAND_INVALID;
+  }
 
   return command;
 }
@@ -724,7 +760,8 @@ static void PrintHelp(const char* name, BROTLI_BOOL error) {
 "                              when encoding: embed comment (fingerprint)\n",
           MAX_COMMENT_LEN);
   fprintf(media,
-"  -D FILE, --dictionary=FILE  use FILE as raw (LZ77) dictionary\n");
+"  -D FILE, --dictionary=FILE  use FILE as raw (LZ77) dictionary\n"
+"  -K, --concatenated          allows concatenated brotli streams as input\n");
   fprintf(media,
 "  -S SUF, --suffix=SUF        output file suffix (default:'%s')\n",
           DEFAULT_SUFFIX);
@@ -1086,6 +1123,7 @@ static BROTLI_BOOL ProvideOutput(Context* context) {
 static BROTLI_BOOL FlushOutput(Context* context) {
   if (!WriteOutput(context)) return BROTLI_FALSE;
   context->available_out = 0;
+  context->next_out = context->output;
   return BROTLI_TRUE;
 }
 
@@ -1157,7 +1195,27 @@ static void OnMetadataChunk(void* opaque, const uint8_t* data, size_t size) {
   }
 }
 
-static BROTLI_BOOL DecompressFile(Context* context, BrotliDecoderState* s) {
+static BROTLI_BOOL InitDecoder(Context* context) {
+  context->decoder = BrotliDecoderCreateInstance(NULL, NULL, NULL);
+  if (!context->decoder) {
+    fprintf(stderr, "out of memory\n");
+    return BROTLI_FALSE;
+  }
+  /* This allows decoding "large-window" streams. Though it creates
+     fragmentation (new builds decode streams that old builds don't),
+     it is better from a user experience perspective. */
+  BrotliDecoderSetParameter(
+      context->decoder, BROTLI_DECODER_PARAM_LARGE_WINDOW, 1u);
+  if (context->dictionary) {
+    BrotliDecoderAttachDictionary(context->decoder,
+        BROTLI_SHARED_DICTIONARY_RAW, context->dictionary_size,
+        context->dictionary);
+  }
+  return BROTLI_TRUE;
+}
+
+static BROTLI_BOOL DecompressFile(Context* context) {
+  BrotliDecoderState* s = context->decoder;
   BrotliDecoderResult result = BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT;
   if (context->comment_len) {
     context->comment_state = COMMENT_INIT;
@@ -1192,31 +1250,52 @@ static BROTLI_BOOL DecompressFile(Context* context, BrotliDecoderState* s) {
       if (!ProvideOutput(context)) return BROTLI_FALSE;
     } else if (result == BROTLI_DECODER_RESULT_SUCCESS) {
       if (!FlushOutput(context)) return BROTLI_FALSE;
-      int has_more_input =
-          (context->available_in != 0) || (fgetc(context->fin) != EOF);
-      if (has_more_input) {
-        fprintf(stderr, "corrupt input [%s]\n",
-                PrintablePath(context->current_input_path));
-        if (context->verbosity > 0) {
-          fprintf(stderr, "reason: extra input\n");
+      BROTLI_BOOL has_more_input = (context->available_in != 0);
+      int extra_char = EOF;
+      if (!has_more_input) {
+        extra_char = fgetc(context->fin);
+        if (extra_char != EOF) {
+          has_more_input = BROTLI_TRUE;
+          context->input[0] = (uint8_t)extra_char;
+          context->next_in = context->input;
+          context->available_in = 1;
         }
-        return BROTLI_FALSE;
-      }
-      if (context->verbosity > 0) {
-        context->end_time = clock();
-        fprintf(stderr, "Decompressed ");
-        PrintFileProcessingProgress(context);
-        fprintf(stderr, "\n");
       }
-      /* Final check */
-      if (context->comment_state != COMMENT_OK) {
-        fprintf(stderr, "corrupt input [%s]\n",
-                PrintablePath(context->current_input_path));
+      if (has_more_input) {
+        if (context->allow_concatenated) {
+          if (context->verbosity > 0) {
+            fprintf(stderr, "extra input\n");
+          }
+          if (!ProvideOutput(context)) return BROTLI_FALSE;
+          BrotliDecoderDestroyInstance(context->decoder);
+          context->decoder = NULL;
+          if (!InitDecoder(context)) return BROTLI_FALSE;
+          s = context->decoder;
+        } else {
+          fprintf(stderr, "corrupt input [%s]\n",
+                  PrintablePath(context->current_input_path));
+          if (context->verbosity > 0) {
+            fprintf(stderr, "reason: extra input\n");
+          }
+          return BROTLI_FALSE;
+        }
+      } else {
         if (context->verbosity > 0) {
-          fprintf(stderr, "reason: comment mismatch\n");
+          context->end_time = clock();
+          fprintf(stderr, "Decompressed ");
+          PrintFileProcessingProgress(context);
+          fprintf(stderr, "\n");
+        }
+        /* Final check */
+        if (context->comment_state != COMMENT_OK) {
+          fprintf(stderr, "corrupt input [%s]\n",
+                  PrintablePath(context->current_input_path));
+          if (context->verbosity > 0) {
+            fprintf(stderr, "reason: comment mismatch\n");
+          }
         }
+        return BROTLI_TRUE;
       }
-      return BROTLI_TRUE;
     } else {  /* result == BROTLI_DECODER_RESULT_ERROR */
       fprintf(stderr, "corrupt input [%s]\n",
               PrintablePath(context->current_input_path));
@@ -1238,27 +1317,16 @@ static BROTLI_BOOL DecompressFiles(Context* context) {
     BROTLI_BOOL is_ok = BROTLI_TRUE;
     BROTLI_BOOL rm_input = BROTLI_FALSE;
     BROTLI_BOOL rm_output = BROTLI_TRUE;
-    BrotliDecoderState* s = BrotliDecoderCreateInstance(NULL, NULL, NULL);
-    if (!s) {
-      fprintf(stderr, "out of memory\n");
-      return BROTLI_FALSE;
-    }
-    /* This allows decoding "large-window" streams. Though it creates
-       fragmentation (new builds decode streams that old builds don't),
-       it is better from used experience perspective. */
-    BrotliDecoderSetParameter(s, BROTLI_DECODER_PARAM_LARGE_WINDOW, 1u);
-    if (context->dictionary) {
-      BrotliDecoderAttachDictionary(s, BROTLI_SHARED_DICTIONARY_RAW,
-          context->dictionary_size, context->dictionary);
-    }
+    if (!InitDecoder(context)) return BROTLI_FALSE;
     is_ok = OpenFiles(context);
     if (is_ok && !context->current_input_path &&
         !context->force_overwrite && isatty(STDIN_FILENO)) {
       fprintf(stderr, "Use -h help. Use -f to force input from a terminal.\n");
       is_ok = BROTLI_FALSE;
     }
-    if (is_ok) is_ok = DecompressFile(context, s);
-    BrotliDecoderDestroyInstance(s);
+    if (is_ok) is_ok = DecompressFile(context);
+    if (context->decoder) BrotliDecoderDestroyInstance(context->decoder);
+    context->decoder = NULL;
     rm_output = !is_ok;
     rm_input = !rm_output && context->junk_source;
     if (!CloseFiles(context, rm_input, rm_output)) is_ok = BROTLI_FALSE;
@@ -1408,6 +1476,7 @@ int main(int argc, char** argv) {
   context.write_to_stdout = BROTLI_FALSE;
   context.decompress = BROTLI_FALSE;
   context.large_window = BROTLI_FALSE;
+  context.allow_concatenated = BROTLI_FALSE;
   context.output_path = NULL;
   context.dictionary_path = NULL;
   context.suffix = DEFAULT_SUFFIX;
@@ -1419,6 +1488,7 @@ int main(int argc, char** argv) {
   context.argv = argv;
   context.dictionary = NULL;
   context.dictionary_size = 0;
+  context.decoder = NULL;
   context.prepared_dictionary = NULL;
   context.modified_path = NULL;
   context.iterator = 0;
diff --git a/c/tools/brotli.md b/c/tools/brotli.md
index cb6d6f3..8792314 100644
--- a/c/tools/brotli.md
+++ b/c/tools/brotli.md
@@ -1,11 +1,13 @@
 # NAME
 
-brotli(1) -- brotli, unbrotli - compress or decompress files
+brotli(1) -- brotli, brcat, unbrotli - compress or decompress files
 
 # SYNOPSIS
 
 `brotli` [*OPTION|FILE*]...
 
+`brcat` is equivalent to `brotli --decompress --concatenated --stdout`
+
 `unbrotli` is equivalent to `brotli --decompress`
 
 # DESCRIPTION
@@ -83,9 +85,15 @@ Conflicting or duplicate _options_ are not allowed.
     `(pow(2, NUM) - 16)`; 0 lets compressor decide over the optimal value;
     bigger windows size improve density; decoder might require up to window size
     memory to operate
+* `-C B64`, `--comment=B64`:
+    set comment; argument is base64-decoded first;
+    when decoding: check stream comment;
+    when encoding: embed comment (fingerprint)
 * `-D FILE`, `--dictionary=FILE`:
     use FILE as raw (LZ77) dictionary; same dictionary MUST be used both for
     compression and decompression
+* `-K`, `--concatenated`:
+    when decoding, allow concatenated brotli streams as input
 * `-S SUF`, `--suffix=SUF`:
     output file suffix (default: `.br`)
 * `-V`, `--version`:
diff --git a/docs/brotli.1 b/docs/brotli.1
index 7ca1355..14a4de1 100644
--- a/docs/brotli.1
+++ b/docs/brotli.1
@@ -4,11 +4,14 @@
 .hy
 .SH NAME
 .PP
-brotli(1) -- brotli, unbrotli - compress or decompress files
+brotli(1) -- brotli, brcat, unbrotli - compress or decompress files
 .SH SYNOPSIS
 .PP
 \f[B]brotli\f[R] [\f[I]OPTION|FILE\f[R]]\&...
 .PP
+\f[B]brcat\f[R] is equivalent to \f[B]brotli --decompress --concatenated
+--stdout\f[R]
+.PP
 \f[B]unbrotli\f[R] is equivalent to \f[B]brotli --decompress\f[R]
 .SH DESCRIPTION
 .PP
@@ -104,10 +107,17 @@ bigger values cause denser, but slower compression
 compressor decide over the optimal value; bigger windows size improve
 density; decoder might require up to window size memory to operate
 .IP \[bu] 2
+\f[B]-C B64\f[R], \f[B]--comment=B64\f[R]: set comment; argument is
+base64-decoded first; when decoding: check stream comment; when
+encoding: embed comment (fingerprint)
+.IP \[bu] 2
 \f[B]-D FILE\f[R], \f[B]--dictionary=FILE\f[R]: use FILE as raw (LZ77)
 dictionary; same dictionary MUST be used both for compression and
 decompression
 .IP \[bu] 2
+\f[B]-K\f[R], \f[B]--concatenated\f[R]: when decoding, allow
+concatenated brotli streams as input
+.IP \[bu] 2
 \f[B]-S SUF\f[R], \f[B]--suffix=SUF\f[R]: output file suffix (default:
 \f[B].br\f[R])
 .IP \[bu] 2
diff --git a/tests/cli_test.sh b/tests/cli_test.sh
index b6c563e..a8e0208 100755
--- a/tests/cli_test.sh
+++ b/tests/cli_test.sh
@@ -13,6 +13,7 @@ function test::brotli_cli::setup() {
   BROTLI="${BROTLI_PKG}/tools/brotli"
   cd ${TEMP_DIR}
   echo "Kot lomom kolol slona" > text.orig
+  echo "Lorem ipsum dolor sit amet. " > ipsum.orig
 }
 
 function test::brotli_cli::teardown() {
@@ -81,4 +82,15 @@ function test::brotli_cli::comment_invalid_chars() {
   EXPECT_FAIL "${BROTLI} -Zfk -C S.GVsbG8= text.orig -o text.br"
 }
 
+function test::brotli_cli::concatenated() {
+  ${BROTLI} -Zfk ipsum.orig -o one.br
+  ${BROTLI} -Zfk text.orig -o two.br
+  cat one.br two.br > full.br
+  EXPECT_FAIL "${BROTLI} -dc full.br  > full.unbr"
+  EXPECT_SUCCEED "${BROTLI} -dKc full.br > full.unbr"
+  EXPECT_SUCCEED "${BROTLI} -dc --concatenated full.br > full.unbr"
+  cat ipsum.orig text.orig > full.orig
+  EXPECT_FILE_CONTENT_EQ full.orig full.unbr
+}
+
 gbash::unit::main "$@"
-- 
cgit v1.1