From f7a8660b9f9e33e17af24384281f0c5ee30f76c4 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sun, 16 Feb 2020 22:29:16 +0100 Subject: [PATCH 1/6] perf: add a test for `git archive --format=tgz` We are about to play with a mode that does not require an external `gzip` to be spawned, and naturally we will want to know about the (speed) performance impact of that. Signed-off-by: Johannes Schindelin --- t/perf/p5005-archive-tgz.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100755 t/perf/p5005-archive-tgz.sh diff --git a/t/perf/p5005-archive-tgz.sh b/t/perf/p5005-archive-tgz.sh new file mode 100755 index 00000000000000..720e51e75fba6e --- /dev/null +++ b/t/perf/p5005-archive-tgz.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +test_description='Test archive --format=tgz performance' + +. ./perf-lib.sh + +test_perf_default_repo + +test_perf 'archive --format=tgz' ' + git archive --format=tgz HEAD >/dev/null +' + +test_done + From 732ba496150ed4bd11097c703ef1bed57417da1a Mon Sep 17 00:00:00 2001 From: Rohit Ashiwal Date: Fri, 15 Feb 2019 19:03:57 +0530 Subject: [PATCH 2/6] archive: factor out writing blocks into a separate function The `git archive --format=tgz` command spawns `gzip` to perform the actual compression. However, the MinGit flavor of Git for Windows comes without `gzip` bundled inside. To help with that, we will teach `git archive` to let zlib perform the actual compression. In preparation for this, we consolidate all the block writes into the function `write_block_or_die()`. Note: .tar files have a well-defined, fixed block size. For that reason, it does not make any sense to pass anything but a fully-populated, full-length block to the `write_block_or_die()` function, and we can save ourselves some future trouble by not even allowing to pass an incorrect `size` parameter to it. Signed-off-by: Rohit Ashiwal Signed-off-by: Johannes Schindelin --- archive-tar.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index e16d3f756ddd61..e99d3bfe22888c 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -38,11 +38,16 @@ static int write_tar_filter_archive(const struct archiver *ar, #define USTAR_MAX_MTIME 077777777777ULL #endif +/* writes out the whole block, or dies if fails */ +static void write_block_or_die(const char *block) { + write_or_die(1, block, BLOCKSIZE); +} + /* writes out the whole block, but only if it is full */ static void write_if_needed(void) { if (offset == BLOCKSIZE) { - write_or_die(1, block, BLOCKSIZE); + write_block_or_die(block); offset = 0; } } @@ -66,7 +71,7 @@ static void do_write_blocked(const void *data, unsigned long size) write_if_needed(); } while (size >= BLOCKSIZE) { - write_or_die(1, buf, BLOCKSIZE); + write_block_or_die(buf); size -= BLOCKSIZE; buf += BLOCKSIZE; } @@ -101,10 +106,10 @@ static void write_trailer(void) { int tail = BLOCKSIZE - offset; memset(block + offset, 0, tail); - write_or_die(1, block, BLOCKSIZE); + write_block_or_die(block); if (tail < 2 * RECORDSIZE) { memset(block, 0, offset); - write_or_die(1, block, BLOCKSIZE); + write_block_or_die(block); } } From 7dcdec98cfb33252717a0a4091f117a128841656 Mon Sep 17 00:00:00 2001 From: Rohit Ashiwal Date: Tue, 19 Feb 2019 22:28:41 +0530 Subject: [PATCH 3/6] archive: optionally use zlib directly for gzip compression As we already link to the zlib library, we can perform the compression without even requiring gzip on the host machine. Note: the `-n` flag that `git archive` passed to `gzip` wants to ensure that a reproducible file is written, i.e. no filename or mtime will be recorded in the compressed output. This is already the default for zlib's `gzopen()` function (if the file name or mtime should be recorded, the `deflateSetHeader()` function would have to be called instead). Note also that the `gzFile` datatype is defined as a pointer in `zlib.h`, i.e. we can rely on the fact that it can be `NULL`. At this point, this new mode is hidden behind the pseudo command `:internal-gzip`: assign this magic string to the `archive.tgz.command` config setting to enable it. Signed-off-by: Rohit Ashiwal Signed-off-by: Johannes Schindelin --- archive-tar.c | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index e99d3bfe22888c..19b502d8a40f1f 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -17,6 +17,8 @@ static unsigned long offset; static int tar_umask = 002; +static gzFile gzip; + static int write_tar_filter_archive(const struct archiver *ar, struct archiver_args *args); @@ -40,7 +42,10 @@ static int write_tar_filter_archive(const struct archiver *ar, /* writes out the whole block, or dies if fails */ static void write_block_or_die(const char *block) { - write_or_die(1, block, BLOCKSIZE); + if (!gzip) + write_or_die(1, block, BLOCKSIZE); + else if (gzwrite(gzip, block, (unsigned) BLOCKSIZE) != BLOCKSIZE) + die(_("gzwrite failed")); } /* writes out the whole block, but only if it is full */ @@ -466,18 +471,37 @@ static int write_tar_filter_archive(const struct archiver *ar, filter.use_shell = 1; filter.in = -1; - if (start_command(&filter) < 0) - die_errno(_("unable to start '%s' filter"), argv[0]); - close(1); - if (dup2(filter.in, 1) < 0) - die_errno(_("unable to redirect descriptor")); - close(filter.in); + if (!strcmp(":internal-gzip:", ar->data)) { + gzip = gzdopen(fileno(stdout), "wb"); + if (!gzip) + die(_("Could not gzdopen stdout")); + if (args->compression_level >= 0 && + gzsetparams(gzip, args->compression_level, + Z_DEFAULT_STRATEGY) != Z_OK) + die(_("unable to set compression level %d"), + args->compression_level); + } else { + if (start_command(&filter) < 0) + die_errno(_("unable to start '%s' filter"), argv[0]); + close(1); + if (dup2(filter.in, 1) < 0) + die_errno(_("unable to redirect descriptor")); + close(filter.in); + } r = write_tar_archive(ar, args); - close(1); - if (finish_command(&filter) != 0) - die(_("'%s' filter reported error"), argv[0]); + if (gzip) { + int ret = gzclose(gzip); + if (ret == Z_ERRNO) + die_errno(_("gzclose failed")); + else if (ret != Z_OK) + die(_("gzclose failed (%d)"), ret); + } else { + close(1); + if (finish_command(&filter) != 0) + die(_("'%s' filter reported error"), argv[0]); + } strbuf_release(&cmd); return r; From ead96e8167558bb69fec67a5c44cd658c27903d4 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 26 Apr 2019 09:38:41 -0400 Subject: [PATCH 4/6] archive: use the internal zlib-based gzip compression by default We just introduced support for compressing `.tar.gz` archives in the `git archive` process itself, using zlib directly instead of spawning `gzip`. While this takes less CPU time overall, on multi-core machines, this is slightly slower in terms of wall clock time (it seems to be in the ballpark of 15%). It does reduce the number of dependencies by one, though, which makes it desirable to turn that mode on by default. Changing the default benefits most notably the MinGit flavor of Git for Windows (which intends to support 3rd-party applications that want to use Git and want to bundle a minimal set of files for that purpose, i.e. stripping out all non-essential files such as interactive commands, Perl, and yes, also `gzip`). We also can now remove the `GZIP` prerequisite from quite a number of test cases in `t/t5000-tar-tree.sh`. This closes https://github.com/git-for-windows/git/issues/1970 Signed-off-by: Johannes Schindelin --- Documentation/git-archive.txt | 3 ++- archive-tar.c | 4 ++-- t/t5000-tar-tree.sh | 18 ++++++++++++------ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/Documentation/git-archive.txt b/Documentation/git-archive.txt index cfa1e4ebe4860d..76e621491fc4d0 100644 --- a/Documentation/git-archive.txt +++ b/Documentation/git-archive.txt @@ -116,7 +116,8 @@ tar..command:: format is given. + The "tar.gz" and "tgz" formats are defined automatically and default to -`gzip -cn`. You may override them with custom commands. +`:internal-gzip:`, triggering an in-process gzip compression. You may +override them with custom commands, e.g. `gzip -cn` or `pigz -cn`. tar..remote:: If true, enable `` for use by remote clients via diff --git a/archive-tar.c b/archive-tar.c index 19b502d8a40f1f..aa5820646f31f3 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -518,9 +518,9 @@ void init_tar_archiver(void) int i; register_archiver(&tar_archiver); - tar_filter_config("tar.tgz.command", "gzip -cn", NULL); + tar_filter_config("tar.tgz.command", ":internal-gzip:", NULL); tar_filter_config("tar.tgz.remote", "true", NULL); - tar_filter_config("tar.tar.gz.command", "gzip -cn", NULL); + tar_filter_config("tar.tar.gz.command", ":internal-gzip:", NULL); tar_filter_config("tar.tar.gz.remote", "true", NULL); git_config(git_tar_config, NULL); for (i = 0; i < nr_tar_filters; i++) { diff --git a/t/t5000-tar-tree.sh b/t/t5000-tar-tree.sh index 37655a237cb783..ed0eb713752c82 100755 --- a/t/t5000-tar-tree.sh +++ b/t/t5000-tar-tree.sh @@ -298,36 +298,42 @@ test_expect_success 'only enabled filters are available remotely' ' test_cmp_bin remote.bar config.bar ' -test_expect_success GZIP 'git archive --format=tgz' ' +test_expect_success 'git archive --format=tgz' ' git archive --format=tgz HEAD >j.tgz ' -test_expect_success GZIP 'git archive --format=tar.gz' ' +test_expect_success 'git archive --format=tar.gz' ' git archive --format=tar.gz HEAD >j1.tar.gz && test_cmp_bin j.tgz j1.tar.gz ' -test_expect_success GZIP 'infer tgz from .tgz filename' ' +test_expect_success 'infer tgz from .tgz filename' ' git archive --output=j2.tgz HEAD && test_cmp_bin j.tgz j2.tgz ' -test_expect_success GZIP 'infer tgz from .tar.gz filename' ' +test_expect_success 'infer tgz from .tar.gz filename' ' git archive --output=j3.tar.gz HEAD && test_cmp_bin j.tgz j3.tar.gz ' +test_expect_success 'use `archive.tgz.command=:internal-gzip:` explicitly' ' + git -c archive.tgz.command=:internal-gzip: archive --output=j4.tgz \ + HEAD && + test_cmp_bin j.tgz j4.tgz +' + test_expect_success GZIP 'extract tgz file' ' gzip -d -c j.tar && test_cmp_bin b.tar j.tar ' -test_expect_success GZIP 'remote tar.gz is allowed by default' ' +test_expect_success 'remote tar.gz is allowed by default' ' git archive --remote=. --format=tar.gz HEAD >remote.tar.gz && test_cmp_bin j.tgz remote.tar.gz ' -test_expect_success GZIP 'remote tar.gz can be disabled' ' +test_expect_success 'remote tar.gz can be disabled' ' git config tar.tar.gz.remote false && test_must_fail git archive --remote=. --format=tar.gz HEAD \ >remote.tar.gz From 677045904521925fa54f3ff249dd8b540010de59 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sun, 16 Feb 2020 23:42:55 +0100 Subject: [PATCH 5/6] archive (internal gzip): compress in a separate thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than compressing the file contents in the same thread as unpacking the objects from Git's object database, we can distribute the load onto two threads, speeding up the operation a bit in the process. Note: to allow forcing single-threaded operations, this mode now respects the `pack.threads` setting. As to the speed impact, with zlib v1.2.11, the extra thread seems to be slightly faster than the version without the extra thread, but it seems to be still about 20% slower than spawning `gzip` in this developer's setup. However, as per a recent mail on the zlib mailing list (see http://madler.net/pipermail/zlib-devel_madler.net/2019-December/003308.html for details), the situation will (hopefully) improve soon-ish. Patches provided by Intel at https://github.com/jtkukunas/zlib, for example, seem to reverse the picture in this developer's setup: even the single-threaded `:internal-gzip:` is now about 5% faster than the mode spawning `gzip`. Initial-patch-by: René Scharfe Signed-off-by: Johannes Schindelin --- archive-tar.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/archive-tar.c b/archive-tar.c index aa5820646f31f3..53bca898dc6607 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -40,10 +40,12 @@ static int write_tar_filter_archive(const struct archiver *ar, #define USTAR_MAX_MTIME 077777777777ULL #endif +static int out_fd = 1, nr_threads = 0; + /* writes out the whole block, or dies if fails */ static void write_block_or_die(const char *block) { if (!gzip) - write_or_die(1, block, BLOCKSIZE); + write_or_die(out_fd, block, BLOCKSIZE); else if (gzwrite(gzip, block, (unsigned) BLOCKSIZE) != BLOCKSIZE) die(_("gzwrite failed")); } @@ -425,6 +427,13 @@ static int tar_filter_config(const char *var, const char *value, void *data) static int git_tar_config(const char *var, const char *value, void *cb) { + if (!strcmp(var, "pack.threads")) { + nr_threads = git_config_int(var, value); + if (nr_threads < 0) + nr_threads = 1; /* fall back to single-threaded */ + return 0; + } + if (!strcmp(var, "tar.umask")) { if (value && !strcmp(value, "user")) { tar_umask = umask(0); @@ -450,6 +459,31 @@ static int write_tar_archive(const struct archiver *ar, return err; } +static int internal_gzip(int in, int out, void *data) +{ + gzip = gzdopen(1, "wb"); + if (!gzip) + return error(_("gzdopen failed")); + if (gzsetparams(gzip, *(int *)data, Z_DEFAULT_STRATEGY) != Z_OK) + return error(_("unable to set compression level")); + + for (;;) { + char buf[BLOCKSIZE]; + ssize_t read = xread(in, buf, sizeof(buf)); + if (read < 0) + die_errno(_("read failed")); + if (read == 0) + break; + if (gzwrite(gzip, buf, read) != read) + die(_("gzwrite failed")); + } + + close(in); + if (gzclose(gzip) != Z_OK) + return error(_("gzclose failed")); + return 0; +} + static int write_tar_filter_archive(const struct archiver *ar, struct archiver_args *args) { @@ -461,6 +495,28 @@ static int write_tar_filter_archive(const struct archiver *ar, if (!ar->data) BUG("tar-filter archiver called with no filter defined"); + if (!strcmp(ar->data, ":internal-gzip:") && + /* use separate thread? */ + (nr_threads > 1 || (nr_threads == 0 && online_cpus() > 1))) { + struct async filter = { + .proc = internal_gzip, + .data = &args->compression_level, + .in = -1 + }; + + if (start_async(&filter)) + return error(_("unable to fork off internal gzip")); + out_fd = filter.in; + + r = write_tar_archive(ar, args); + + close(out_fd); + if (finish_async(&filter)) + return error(_("error in internal gzip")); + + return r; + } + strbuf_addstr(&cmd, ar->data); if (args->compression_level >= 0) strbuf_addf(&cmd, " -%d", args->compression_level); From 409bd96db11cf614c3ccbd5ff98910f5b059d76a Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 21 Feb 2020 15:53:16 +0100 Subject: [PATCH 6/6] DEBUG Signed-off-by: Johannes Schindelin --- t/t5000-tar-tree.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/t/t5000-tar-tree.sh b/t/t5000-tar-tree.sh index ed0eb713752c82..8ad6227e6e5242 100755 --- a/t/t5000-tar-tree.sh +++ b/t/t5000-tar-tree.sh @@ -215,6 +215,7 @@ test_expect_success 'git archive with --output, override inferred format' ' test_expect_success GZIP 'git archive with --output and --remote creates .tgz' ' git archive --output=d5.tgz --remote=. HEAD && + ls -l d5.tgz && gzip -d -c d5.tar && test_cmp_bin b.tar d5.tar '