Skip to content

Commit a4c2d5a

Browse files
committed
midx: implement midx_repack()
To repack using a multi-pack-index, first sort all pack-files by their modified time. Second, walk those pack-files from oldest to newest, adding the packs to a list if they are smaller than the given pack-size. Finally, collect the objects from the multi-pack-index that are in those packs and send them to 'git pack-objects'. While first designing a 'git multi-pack-index repack' operation, I started by collecting the batches based on the size of the objects instead of the size of the pack-files. This allows repacking a large pack-file that has very few referenced objects. However, this came at a significant cost of parsing pack-files instead of simply reading the multi-pack-index and getting the file information for the pack-files. This object-size idea could be a direction for future expansion in this area. Signed-off-by: Derrick Stolee <[email protected]>
1 parent b39f90a commit a4c2d5a

File tree

2 files changed

+133
-1
lines changed

2 files changed

+133
-1
lines changed

midx.c

Lines changed: 108 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "sha1-lookup.h"
99
#include "midx.h"
1010
#include "progress.h"
11+
#include "run-command.h"
1112

1213
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
1314
#define MIDX_VERSION 1
@@ -1107,7 +1108,113 @@ int expire_midx_packs(const char *object_dir)
11071108
return result;
11081109
}
11091110

1110-
int midx_repack(const char *object_dir, size_t batch_size)
1111+
/*
 * Sort record used by midx_repack(): pairs a pack-file's modification
 * time (mtime, copied from the pack's packed_git entry) with that pack's
 * position in the multi-pack-index pack list (pack_int_id), so the packs
 * can be ordered by mtime without losing track of which pack is which.
 */
struct time_and_id {
1112+
timestamp_t mtime;
1113+
uint32_t pack_int_id;
1114+
};
1115+
1116+
static int compare_by_mtime(const void *a_, const void *b_)
11111117
{
1118+
const struct time_and_id *a, *b;
1119+
1120+
a = (const struct time_and_id *)a_;
1121+
b = (const struct time_and_id *)b_;
1122+
1123+
if (a->mtime < b->mtime)
1124+
return -1;
1125+
if (a->mtime > b->mtime)
1126+
return 1;
11121127
return 0;
11131128
}
1129+
1130+
int midx_repack(const char *object_dir, size_t batch_size)
1131+
{
1132+
int result = 0;
1133+
uint32_t i, packs_to_repack;
1134+
size_t total_size;
1135+
struct time_and_id *pack_ti;
1136+
unsigned char *include_pack;
1137+
struct child_process cmd = CHILD_PROCESS_INIT;
1138+
struct strbuf base_name = STRBUF_INIT;
1139+
struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
1140+
1141+
if (!m)
1142+
return 0;
1143+
1144+
include_pack = xcalloc(m->num_packs, sizeof(unsigned char));
1145+
pack_ti = xcalloc(m->num_packs, sizeof(struct time_and_id));
1146+
1147+
for (i = 0; i < m->num_packs; i++) {
1148+
pack_ti[i].pack_int_id = i;
1149+
1150+
if (prepare_midx_pack(m, i))
1151+
continue;
1152+
1153+
pack_ti[i].mtime = m->packs[i]->mtime;
1154+
}
1155+
QSORT(pack_ti, m->num_packs, compare_by_mtime);
1156+
1157+
total_size = 0;
1158+
packs_to_repack = 0;
1159+
for (i = 0; total_size < batch_size && i < m->num_packs; i++) {
1160+
int pack_int_id = pack_ti[i].pack_int_id;
1161+
struct packed_git *p = m->packs[pack_int_id];
1162+
1163+
if (!p)
1164+
continue;
1165+
if (p->pack_size >= batch_size)
1166+
continue;
1167+
1168+
packs_to_repack++;
1169+
total_size += p->pack_size;
1170+
include_pack[pack_int_id] = 1;
1171+
}
1172+
1173+
if (total_size < batch_size || packs_to_repack < 2)
1174+
goto cleanup;
1175+
1176+
argv_array_push(&cmd.args, "pack-objects");
1177+
1178+
strbuf_addstr(&base_name, object_dir);
1179+
strbuf_addstr(&base_name, "/pack/pack");
1180+
argv_array_push(&cmd.args, base_name.buf);
1181+
strbuf_release(&base_name);
1182+
1183+
cmd.git_cmd = 1;
1184+
cmd.in = cmd.out = -1;
1185+
1186+
if (start_command(&cmd)) {
1187+
error(_("could not start pack-objects"));
1188+
result = 1;
1189+
goto cleanup;
1190+
}
1191+
1192+
for (i = 0; i < m->num_objects; i++) {
1193+
struct object_id oid;
1194+
uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
1195+
1196+
if (!include_pack[pack_int_id])
1197+
continue;
1198+
1199+
nth_midxed_object_oid(&oid, m, i);
1200+
xwrite(cmd.in, oid_to_hex(&oid), the_hash_algo->hexsz);
1201+
xwrite(cmd.in, "\n", 1);
1202+
}
1203+
close(cmd.in);
1204+
1205+
if (finish_command(&cmd)) {
1206+
error(_("could not finish pack-objects"));
1207+
result = 1;
1208+
goto cleanup;
1209+
}
1210+
1211+
result = write_midx_internal(object_dir, m, NULL);
1212+
m = NULL;
1213+
1214+
cleanup:
1215+
if (m)
1216+
close_midx(m);
1217+
free(include_pack);
1218+
free(pack_ti);
1219+
return result;
1220+
}

t/t5319-multi-pack-index.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,4 +424,29 @@ test_expect_success 'repack with minimum size does not alter existing packs' '
424424
)
425425
'
426426

427+
test_expect_success 'repack creates a new pack' '
	(
		cd dup &&
		# Read sizes with "wc -c" instead of parsing "ls -l": the size
		# column of "ls -l" shifts when an owner or group name contains
		# whitespace. "grep pack" drops the "total" line from wc.
		SECOND_SMALLEST_SIZE=$(wc -c .git/objects/pack/*pack | grep pack | awk "{print \$1;}" | sort -n | head -n 2 | tail -n 1) &&
		# One byte more than the second-smallest pack, so the batch
		# can hold at least two packs.
		BATCH_SIZE=$(($SECOND_SMALLEST_SIZE + 1)) &&
		git multi-pack-index repack --batch-size=$BATCH_SIZE &&
		ls .git/objects/pack/*idx >idx-list &&
		test_line_count = 5 idx-list &&
		test-tool read-midx .git/objects | grep idx >midx-list &&
		test_line_count = 5 midx-list
	)
'
439+
440+
# Runs "git multi-pack-index expire" in the "dup" repository and checks
# that exactly the three largest pack-files (per "ls -S") remain on disk
# afterwards, and that the multi-pack-index then lists three packs.
test_expect_success 'expire removes repacked packs' '
441+
(
442+
cd dup &&
443+
ls -S .git/objects/pack/*pack | head -n 3 >expect &&
444+
git multi-pack-index expire &&
445+
ls -S .git/objects/pack/*pack >actual &&
446+
test_cmp expect actual &&
447+
test-tool read-midx .git/objects | grep idx >midx-list &&
448+
test_line_count = 3 midx-list
449+
)
450+
'
451+
427452
test_done

0 commit comments

Comments
 (0)