From 2d7bb64acfcd84d43e02a796aad464059ff49eec Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 24 Apr 2019 08:44:09 -0700 Subject: [PATCH 1/5] t9350: fix encoding test to actually test reencoding This test used an author with non-ascii characters in the name, but no special commit message. It then grep'ed for those non-ascii characters, but those are guaranteed to exist regardless of the reencoding process since the reencoding only affects the commit message, not the author or committer names. As such, the test would work even if the re-encoding process simply stripped the commit message entirely. Modify the test to actually check that the reencoding in utf-8 worked. Signed-off-by: Elijah Newren --- t/t9350-fast-export.sh | 29 +++++++++++++------- t/t9350/simple-iso-8859-7-commit-message.txt | 1 + 2 files changed, 20 insertions(+), 10 deletions(-) create mode 100644 t/t9350/simple-iso-8859-7-commit-message.txt diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh index 5690fe28106624..ef9b1aa20b2401 100755 --- a/t/t9350-fast-export.sh +++ b/t/t9350-fast-export.sh @@ -94,22 +94,32 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' ' test $MUSS = $(git rev-parse --verify refs/tags/muss) ' -test_expect_success 'iso-8859-1' ' +test_expect_success 'iso-8859-7' ' - git config i18n.commitencoding ISO8859-1 && - # use author and committer name in ISO-8859-1 to match it. - . "$TEST_DIRECTORY"/t3901/8859-1.txt && + test_when_finished "git reset --hard HEAD~1" && + test_config i18n.commitencoding iso-8859-7 && test_tick && echo rosten >file && - git commit -s -m den file && - git fast-export wer^..wer >iso8859-1.fi && - sed "s/wer/i18n/" iso8859-1.fi | + git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file && + git fast-export wer^..wer >iso-8859-7.fi && + sed "s/wer/i18n/" iso-8859-7.fi | (cd new && git fast-import && + # The commit object, if not re-encoded, would be 240 bytes. + # Removing the "encoding iso-8859-7\n" header drops 20 bytes. + # Re-encoding the Pi character from \xF0 in iso-8859-7 to + # \xCF\x80 in utf-8 adds a byte. Grepping for specific bytes + # would be nice, but Windows apparently munges user data + # in the form of bytes on the command line to force them to + # be characters instead, so we are limited for portability + # reasons in subsequent similar tests in this file to check + # for size rather than what bytes are present. + test 221 -eq "$(git cat-file -s i18n)" && + # Also make sure the commit does not have the "encoding" header git cat-file commit i18n >actual && - grep "Áéí óú" actual) - + ! grep ^encoding actual) ' + test_expect_success 'import/export-marks' ' git checkout -b marks master && @@ -224,7 +234,6 @@ GIT_COMMITTER_NAME='C O Mitter'; export GIT_COMMITTER_NAME test_expect_success 'setup copies' ' - git config --unset i18n.commitencoding && git checkout -b copy rein && git mv file file3 && git commit -m move1 && diff --git a/t/t9350/simple-iso-8859-7-commit-message.txt b/t/t9350/simple-iso-8859-7-commit-message.txt new file mode 100644 index 00000000000000..8b3f0c3dba3f0a --- /dev/null +++ b/t/t9350/simple-iso-8859-7-commit-message.txt @@ -0,0 +1 @@ +Pi: � \ No newline at end of file From 9fa5695017f4ed4b1fbd095106571cbd73cc6632 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 24 Apr 2019 13:42:12 -0700 Subject: [PATCH 2/5] fast-import: support 'encoding' commit header Since git supports commit messages with an encoding other than utf-8, allow fast-import to import such commits. This may be useful for folks who do not want to reencode commit messages from an external system, and may also be useful to achieve reversible history rewrites (e.g. sha1sum <-> sha256sum transitions or subtree work) with git repositories that have used specialized encodings in their commit history. Signed-off-by: Elijah Newren --- Documentation/git-fast-import.txt | 7 +++++++ fast-import.c | 11 +++++++++-- t/t9300-fast-import.sh | 20 ++++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt index d65cdb3d08fd74..7baf9e47b5e613 100644 --- a/Documentation/git-fast-import.txt +++ b/Documentation/git-fast-import.txt @@ -388,6 +388,7 @@ change to the project. original-oid? ('author' (SP )? SP LT GT SP LF)? 'committer' (SP )? SP LT GT SP LF + ('encoding' SP )? data ('from' SP LF)? ('merge' SP LF)? @@ -455,6 +456,12 @@ that was selected by the --date-format= command-line option. See ``Date Formats'' above for the set of supported formats, and their syntax. +`encoding` +^^^^^^^^^^ +The optional `encoding` command indicates the encoding of the commit +message. Most commits are UTF-8 and the encoding is omitted, but this +allows importing commit messages into git without first reencoding them. + `from` ^^^^^^ The `from` command is used to specify the commit to initialize diff --git a/fast-import.c b/fast-import.c index f38d04fa58510b..76a7bd369987f7 100644 --- a/fast-import.c +++ b/fast-import.c @@ -2585,6 +2585,7 @@ static void parse_new_commit(const char *arg) struct branch *b; char *author = NULL; char *committer = NULL; + const char *encoding = NULL; struct hash_list *merge_list = NULL; unsigned int merge_count; unsigned char prev_fanout, new_fanout; @@ -2607,6 +2608,8 @@ static void parse_new_commit(const char *arg) } if (!committer) die("Expected committer but didn't get one"); + if (skip_prefix(command_buf.buf, "encoding ", &encoding)) + read_next_command(); parse_data(&msg, 0, NULL); read_next_command(); parse_from(b); @@ -2670,9 +2673,13 @@ static void parse_new_commit(const char *arg) } strbuf_addf(&new_data, "author %s\n" - "committer %s\n" - "\n", + "committer %s\n", author ? author : committer, committer); + if (encoding) + strbuf_addf(&new_data, + "encoding %s\n", + encoding); + strbuf_addch(&new_data, '\n'); strbuf_addbuf(&new_data, &msg); free(author); free(committer); diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh index 3668263c4046d9..141b7fa35e74b8 100755 --- a/t/t9300-fast-import.sh +++ b/t/t9300-fast-import.sh @@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import ' +### +### series X (other new features) +### + +test_expect_success 'X: handling encoding' ' + test_tick && + cat >input <<-INPUT_END && + commit refs/heads/encoding + committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE + encoding iso-8859-7 + data <>input && + + git fast-import Date: Wed, 24 Apr 2019 14:29:28 -0700 Subject: [PATCH 3/5] fast-export: avoid stripping encoding header if we cannot reencode When fast-export encounters a commit with an 'encoding' header, it tries to reencode in utf-8 and then drops the encoding header. However, if it fails to reencode in utf-8 because e.g. one of the characters in the commit message was invalid in the old encoding, then we need to retain the original encoding or otherwise we lose information needed to understand all the other (valid) characters in the original commit message. Signed-off-by: Elijah Newren --- builtin/fast-export.c | 7 +++++-- t/t9350-fast-export.sh | 17 +++++++++++++++++ t/t9350/broken-iso-8859-7-commit-message.txt | 1 + 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 t/t9350/broken-iso-8859-7-commit-message.txt diff --git a/builtin/fast-export.c b/builtin/fast-export.c index 9e283482efcfa6..7734a9f5a5df4f 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -642,9 +642,12 @@ static void handle_commit(struct commit *commit, struct rev_info *rev, printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum); if (show_original_ids) printf("original-oid %s\n", oid_to_hex(&commit->object.oid)); - printf("%.*s\n%.*s\ndata %u\n%s", + printf("%.*s\n%.*s\n", (int)(author_end - author), author, - (int)(committer_end - committer), committer, + (int)(committer_end - committer), committer); + if (!reencoded && encoding) + printf("encoding %s\n", encoding); + printf("data %u\n%s", (unsigned)(reencoded ? strlen(reencoded) : message ? strlen(message) : 0), diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh index ef9b1aa20b2401..fa124f4842a784 100755 --- a/t/t9350-fast-export.sh +++ b/t/t9350-fast-export.sh @@ -120,6 +120,23 @@ test_expect_success 'iso-8859-7' ' ! grep ^encoding actual) ' +test_expect_success 'encoding preserved if reencoding fails' ' + + test_when_finished "git reset --hard HEAD~1" && + test_config i18n.commitencoding iso-8859-7 && + echo rosten >file && + git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file && + git fast-export wer^..wer >iso-8859-7.fi && + sed "s/wer/i18n-invalid/" iso-8859-7.fi | + (cd new && + git fast-import && + git cat-file commit i18n-invalid >actual && + grep ^encoding actual && + # Also verify that the commit has the expected size; i.e. + # that no bytes were re-encoded to a different encoding. + test 252 -eq "$(git cat-file -s i18n-invalid)") +' + test_expect_success 'import/export-marks' ' git checkout -b marks master && diff --git a/t/t9350/broken-iso-8859-7-commit-message.txt b/t/t9350/broken-iso-8859-7-commit-message.txt new file mode 100644 index 00000000000000..d06ad75b448260 --- /dev/null +++ b/t/t9350/broken-iso-8859-7-commit-message.txt @@ -0,0 +1 @@ +Pi: �; Invalid: � \ No newline at end of file From 83b3656b769c9be92748a9ab3414ef8a3b325155 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 24 Apr 2019 14:38:31 -0700 Subject: [PATCH 4/5] fast-export: differentiate between explicitly utf-8 and implicitly utf-8 The find_encoding() function returned the encoding used by a commit message, returning a default of git_commit_encoding (usually utf-8). Although the current code does not differentiate between a commit which explicitly requested utf-8 and one where we just assume utf-8 because no encoding is set, it will become important when we try to preserve the encoding header. Since is_encoding_utf8() returns true when passed NULL, we can just return NULL from find_encoding() instead of returning git_commit_encoding. Signed-off-by: Elijah Newren --- builtin/fast-export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/fast-export.c b/builtin/fast-export.c index 7734a9f5a5df4f..66331fa4010c6c 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -453,7 +453,7 @@ static const char *find_encoding(const char *begin, const char *end) bol = memmem(begin, end ? end - begin : strlen(begin), needle, strlen(needle)); if (!bol) - return git_commit_encoding; + return NULL; bol += strlen(needle); eol = strchrnul(bol, '\n'); *eol = '\0'; From 2063122293e97472f8afba4037f65ba0267a1e02 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 24 Apr 2019 08:50:55 -0700 Subject: [PATCH 5/5] fast-export: do automatic reencoding of commit messages only if requested Automatic re-encoding of commit messages (and dropping of the encoding header) hurts attempts to do reversible history rewrites (e.g. sha1sum <-> sha256sum transitions, some subtree rewrites), and seems inconsistent with the general principle followed elsewhere in fast-export of requiring explicit user requests to modify the output (e.g. --signed-tags=strip, --tag-of-filtered-object=rewrite). Add a --reencode flag that the user can use to specify, and like other fast-export flags, default it to 'abort'. Signed-off-by: Elijah Newren --- builtin/fast-export.c | 35 ++++++++++++++++++++++++++++++++--- t/t9350-fast-export.sh | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 6 deletions(-) diff --git a/builtin/fast-export.c b/builtin/fast-export.c index 66331fa4010c6c..43cc52331cea68 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -33,6 +33,7 @@ static const char *fast_export_usage[] = { static int progress; static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT; static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT; +static enum { REENCODE_ABORT, REENCODE_PLEASE, REENCODE_NEVER } reencode_mode = REENCODE_ABORT; static int fake_missing_tagger; static int use_done_feature; static int no_data; @@ -77,6 +78,20 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt, return 0; } +static int parse_opt_reencode_mode(const struct option *opt, + const char *arg, int unset) +{ + if (unset || !strcmp(arg, "abort")) + reencode_mode = REENCODE_ABORT; + else if (!strcmp(arg, "yes")) + reencode_mode = REENCODE_PLEASE; + else if (!strcmp(arg, "no")) + reencode_mode = REENCODE_NEVER; + else + return error("Unknown reencoding mode: %s", arg); + return 0; +} + static struct decoration idnums; static uint32_t last_idnum; @@ -633,10 +648,21 @@ static void handle_commit(struct commit *commit, struct rev_info *rev, } mark_next_object(&commit->object); - if (anonymize) + if (anonymize) { reencoded = anonymize_commit_message(message); - else if (!is_encoding_utf8(encoding)) - reencoded = reencode_string(message, "UTF-8", encoding); + } else if (encoding) { + switch(reencode_mode) { + case REENCODE_PLEASE: + reencoded = reencode_string(message, "UTF-8", encoding); + break; + case REENCODE_NEVER: + break; + case REENCODE_ABORT: + die("Encountered commit-specific encoding %s in commit " + "%s; use --reencode= to handle it", + encoding, oid_to_hex(&commit->object.oid)); + } + } if (!commit->parents) printf("reset %s\n", refname); printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum); @@ -1091,6 +1117,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix) OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"), N_("select handling of tags that tag filtered objects"), parse_opt_tag_of_filtered_mode), + OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"), + N_("select handling of commit messages in an alternate encoding"), + parse_opt_reencode_mode), OPT_STRING(0, "export-marks", &export_filename, N_("file"), N_("Dump marks to this file")), OPT_STRING(0, "import-marks", &import_filename, N_("file"), diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh index fa124f4842a784..3cf4c7fc0cf1f2 100755 --- a/t/t9350-fast-export.sh +++ b/t/t9350-fast-export.sh @@ -94,14 +94,14 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' ' test $MUSS = $(git rev-parse --verify refs/tags/muss) ' -test_expect_success 'iso-8859-7' ' +test_expect_success 'reencoding iso-8859-7' ' test_when_finished "git reset --hard HEAD~1" && test_config i18n.commitencoding iso-8859-7 && test_tick && echo rosten >file && git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file && - git fast-export wer^..wer >iso-8859-7.fi && + git fast-export --reencode=yes wer^..wer >iso-8859-7.fi && sed "s/wer/i18n/" iso-8859-7.fi | (cd new && git fast-import && @@ -120,13 +120,44 @@ test_expect_success 'iso-8859-7' ' ! grep ^encoding actual) ' +test_expect_success 'aborting on iso-8859-7' ' + + test_when_finished "git reset --hard HEAD~1" && + test_config i18n.commitencoding iso-8859-7 && + echo rosten >file && + git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file && + test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi +' + +test_expect_success 'preserving iso-8859-7' ' + + test_when_finished "git reset --hard HEAD~1" && + test_config i18n.commitencoding iso-8859-7 && + echo rosten >file && + git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file && + git fast-export --reencode=no wer^..wer >iso-8859-7.fi && + sed "s/wer/i18n-no-recoding/" iso-8859-7.fi | + (cd new && + git fast-import && + # The commit object, if not re-encoded, is 240 bytes. + # Removing the "encoding iso-8859-7\n" header would drops 20 + # bytes. Re-encoding the Pi character from \xF0 in + # iso-8859-7 to \xCF\x80 in utf-8 would add a byte. I would + # grep for the # specific bytes, but Windows lamely does not + # allow that, so just search for the expected size. + test 240 -eq "$(git cat-file -s i18n-no-recoding)" && + # Also make sure the commit has the "encoding" header + git cat-file commit i18n-no-recoding >actual && + grep ^encoding actual) +' + test_expect_success 'encoding preserved if reencoding fails' ' test_when_finished "git reset --hard HEAD~1" && test_config i18n.commitencoding iso-8859-7 && echo rosten >file && git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file && - git fast-export wer^..wer >iso-8859-7.fi && + git fast-export --reencode=yes wer^..wer >iso-8859-7.fi && sed "s/wer/i18n-invalid/" iso-8859-7.fi | (cd new && git fast-import &&