diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index 473a16135abf86..dffd463eb39c11 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -236,6 +236,23 @@ So does `git bundle` (see linkgit:git-bundle[1]) when it creates a bundle. With this option, parents that are hidden by grafts are packed nevertheless. +--filter-omit-all-blobs:: + Requires `--stdout`. Omits all blobs from the packfile. + +--filter-omit-large-blobs=<n>[kmg]:: + Requires `--stdout`. Omits blobs larger than <n> bytes from + the packfile. May optionally be followed by 'k', 'm', or 'g' units. + Value may be zero. Special files (matching ".git*") are always + included, regardless of size. + +--filter-use-blob=<blob-ish>:: +--filter-use-path=<path>:: + Requires `--stdout`. Use a sparse-checkout specification to + filter the resulting packfile to only contain the blobs that + would be referenced by such a sparse-checkout. `<path>` specifies + a local pathname. `<blob-ish>` specifies an expression that can + be evaluated to a blob.
+ SEE ALSO -------- linkgit:git-rev-list[1] diff --git a/Documentation/git-rev-list.txt b/Documentation/git-rev-list.txt index ef22f1775b6348..b2e825521316d4 100644 --- a/Documentation/git-rev-list.txt +++ b/Documentation/git-rev-list.txt @@ -47,7 +47,14 @@ SYNOPSIS [ --fixed-strings | -F ] [ --date=<format>] [ [ --objects | --objects-edge | --objects-edge-aggressive ] - [ --unpacked ] ] + [ --unpacked ] + [ [ --filter-omit-all-blobs | + --filter-omit-large-blobs=<n>[kmg] | + --filter-use-blob=<blob-ish> | + --filter-use-path=<path> ] + [ --filter-print-missing ] + [ --filter-print-omitted ] ] ] + [ --filter-relax ] [ --pretty | --header ] [ --bisect ] [ --bisect-vars ] diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt index 7d860bfca1442e..7b79a999ef10e7 100644 --- a/Documentation/rev-list-options.txt +++ b/Documentation/rev-list-options.txt @@ -706,6 +706,38 @@ ifdef::git-rev-list[] --unpacked:: Only useful with `--objects`; print the object IDs that are not in packs. + +--filter-omit-all-blobs:: + Only useful with one of the `--objects*`; omits all blobs from + the printed list of objects. + +--filter-omit-large-blobs=<n>[kmg]:: + Only useful with one of the `--objects*`; omits blobs larger than + <n> bytes from the printed list of objects. May optionally be + followed by 'k', 'm', or 'g' units. Value may be zero. Special + files (matching ".git*") are always included, regardless of size. + +--filter-use-blob=<blob-ish>:: +--filter-use-path=<path>:: + Only useful with one of the `--objects*`; uses a sparse-checkout + specification contained in the given object or file to filter the + result to only contain blobs referenced by such a sparse-checkout. + +--filter-print-missing:: + Prints a list of the missing objects for the requested traversal. + Object IDs are prefixed with a ``?'' character. The object type + is printed after the ID. This may be used with or without any of + the above filtering options.
+ +--filter-print-omitted:: + Only useful with one of the above `--filter*`; prints a list + of the omitted objects. Object IDs are prefixed with a ``~'' + character. The object type and size are printed after the ID. + +--filter-relax:: + Relax consistency checking for missing blobs. Do not die on + missing blobs during normal (non-filtering) object traversal + following an earlier partial/narrow clone or fetch. endif::git-rev-list[] --no-walk[=(sorted|unsorted)]:: diff --git a/Makefile b/Makefile index ed4ca438bd9c6d..5b89bd99b8a4f8 100644 --- a/Makefile +++ b/Makefile @@ -804,6 +804,9 @@ LIB_OBJS += levenshtein.o LIB_OBJS += line-log.o LIB_OBJS += line-range.o LIB_OBJS += list-objects.o +LIB_OBJS += list-objects-filter-all.o +LIB_OBJS += list-objects-filter-large.o +LIB_OBJS += list-objects-filter-sparse.o LIB_OBJS += ll-merge.o LIB_OBJS += lockfile.o LIB_OBJS += log-tree.o @@ -821,7 +824,9 @@ LIB_OBJS += notes-cache.o LIB_OBJS += notes-merge.o LIB_OBJS += notes-utils.o LIB_OBJS += object.o +LIB_OBJS += object-filter.o LIB_OBJS += oidset.o +LIB_OBJS += oidset2.o LIB_OBJS += packfile.o LIB_OBJS += pack-bitmap.o LIB_OBJS += pack-bitmap-write.o diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index f721137eaf8814..0bc77700223ef1 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -79,6 +79,8 @@ static unsigned long cache_max_small_delta_size = 1000; static unsigned long window_memory_limit = 0; +static struct object_filter_options filter_options; + /* * stats */ @@ -2816,7 +2818,12 @@ static void get_object_list(int ac, const char **av) if (prepare_revision_walk(&revs)) die("revision walk setup failed"); mark_edges_uninteresting(&revs, show_edge); - traverse_commit_list(&revs, show_commit, show_object, NULL); + if (object_filter_enabled(&filter_options)) + traverse_commit_list_filtered(&filter_options, &revs, + show_commit, show_object, + NULL, NULL); + else + traverse_commit_list(&revs, show_commit, show_object, NULL); if
(unpack_unreachable_expiration) { revs.ignore_missing_links = 1; @@ -2952,6 +2959,15 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) N_("use a bitmap index if available to speed up counting objects")), OPT_BOOL(0, "write-bitmap-index", &write_bitmap_index, N_("write a bitmap index together with the pack index")), + + OPT_PARSE_FILTER_OMIT_ALL_BLOBS(&filter_options), + OPT_PARSE_FILTER_OMIT_LARGE_BLOBS(&filter_options), + OPT_PARSE_FILTER_USE_BLOB(&filter_options), + OPT_PARSE_FILTER_USE_PATH(&filter_options), + /* not needed: OPT_PARSE_FILTER_PRINT_MISSING */ + /* not needed: OPT_PARSE_FILTER_PRINT_OMITTED */ + /* not needed: OPT_PARSE_FILTER_RELAX */ + OPT_END(), }; @@ -3028,6 +3044,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (!rev_list_all || !rev_list_reflog || !rev_list_index) unpack_unreachable_expiration = 0; + if (object_filter_enabled(&filter_options)) { + if (!pack_to_stdout) + die("cannot use filtering with an indexable pack."); + use_bitmap_index = 0; + } + /* * "soft" reasons not to use bitmaps - for on-disk repack by default we want * diff --git a/builtin/rev-list.c b/builtin/rev-list.c index c1c74d4a795643..3509cb376fc84b 100644 --- a/builtin/rev-list.c +++ b/builtin/rev-list.c @@ -54,6 +54,8 @@ static const char rev_list_usage[] = static struct progress *progress; static unsigned progress_counter; +static struct object_filter_options filter_options; +static struct oidset2 missing_objects; static void finish_commit(struct commit *commit, void *data); static void show_commit(struct commit *commit, void *data) @@ -181,8 +183,25 @@ static void finish_commit(struct commit *commit, void *data) static void finish_object(struct object *obj, const char *name, void *cb_data) { struct rev_list_info *info = cb_data; - if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid)) + if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid)) { + if (filter_options.print_missing) { + 
oidset2_insert(&missing_objects, &obj->oid, obj->type,
+				       -1, name);
+			return;
+		}
+		if (filter_options.relax) {
+			/*
+			 * Relax consistency checks to not complain about
+			 * omitted objects (presumably caused by use of
+			 * the previous use of the 'filter-objects' feature).
+			 *
+			 * Note that this is independent of any filtering that
+			 * we are doing in this run.
+			 */
+			return;
+		}
 		die("missing blob object '%s'", oid_to_hex(&obj->oid));
+	}
 	if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT)
 		parse_object(&obj->oid);
 }
@@ -202,6 +221,25 @@ static void show_edge(struct commit *commit)
 	printf("-%s\n", oid_to_hex(&commit->object.oid));
 }
 
+static void print_omitted_object(int i, int i_limit, struct oidset2_entry *e, void *cb_data)
+{
+	/* Prints "~<oid> <type>[ <size>]"; the size is left out when object_length is -1. */
+	const char *tn = typename(e->type);
+
+	if (e->object_length == -1)
+		printf("~%s %s\n", oid_to_hex(&e->oid), tn);
+	else
+		printf("~%s %s %"PRIuMAX"\n", oid_to_hex(&e->oid), tn, (uintmax_t)e->object_length);
+}
+
+static void print_missing_object(int i, int i_limit, struct oidset2_entry *e, void *cb_data)
+{
+	/* Prints "?<oid> <type>"; i, i_limit and cb_data are not used. */
+	const char *tn = typename(e->type);
+
+	printf("?%s %s\n", oid_to_hex(&e->oid), tn);
+}
+
 static void print_var_str(const char *var, const char *val)
 {
 	printf("%s='%s'\n", var, val);
@@ -335,6 +373,15 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 			show_progress = arg;
 			continue;
 		}
+		if (object_filter_hand_parse_arg(
+			    &filter_options, arg, 1, 1, 1)) {
+			if (!revs.blob_objects)
+				die(_("object filtering requires --objects"));
+			if (filter_options.use_blob &&
+			    !oidcmp(&filter_options.sparse_oid, &null_oid))
+				die(_("invalid sparse value"));
+			continue;
+		}
 		usage(rev_list_usage);
 
 	}
@@ -360,6 +407,11 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 	if (revs.show_notes)
 		die(_("rev-list does not support display of notes"));
 
+	if (object_filter_enabled(&filter_options)) {
+		if (use_bitmap_index)
+
die(_("cannot combine --use-bitmap-index with object filtering")); + } + save_commit_buffer = (revs.verbose_header || revs.grep_filter.pattern_list || revs.grep_filter.header_list); @@ -404,7 +456,24 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix) return show_bisect_vars(&info, reaches, all); } - traverse_commit_list(&revs, show_commit, show_object, &info); + if (filter_options.print_missing) + memset(&missing_objects, 0, sizeof(missing_objects)); + + if (object_filter_enabled(&filter_options)) + traverse_commit_list_filtered( + &filter_options, &revs, + show_commit, show_object, + (filter_options.print_omitted + ? print_omitted_object + : NULL), + &info); + else + traverse_commit_list(&revs, show_commit, show_object, &info); + + if (filter_options.print_missing) { + oidset2_foreach(&missing_objects, print_missing_object, &info); + oidset2_clear(&missing_objects); + } stop_progress(&progress); diff --git a/dir.c b/dir.c index 1c55dc3e366f8c..aca35d739f35e1 100644 --- a/dir.c +++ b/dir.c @@ -739,6 +739,11 @@ static void invalidate_directory(struct untracked_cache *uc, dir->dirs[i]->recurse = 0; } +static int add_excludes_from_buffer( + char *buf, size_t size, + const char *base, int baselen, + struct exclude_list *el); + /* * Given a file with name "fname", read it (either from disk, or from * an index if 'istate' is non-null), parse it and store the @@ -754,9 +759,9 @@ static int add_excludes(const char *fname, const char *base, int baselen, struct sha1_stat *sha1_stat) { struct stat st; - int fd, i, lineno = 1; + int fd; size_t size = 0; - char *buf, *entry; + char *buf; fd = open(fname, O_RDONLY); if (fd < 0 || fstat(fd, &st) < 0) { @@ -813,6 +818,18 @@ static int add_excludes(const char *fname, const char *base, int baselen, } } + add_excludes_from_buffer(buf, size, base, baselen, el); + return 0; +} + +static int add_excludes_from_buffer( + char *buf, size_t size, + const char *base, int baselen, + struct exclude_list *el) +{ + int i, 
lineno = 1;
+	char *entry;
+
 	el->filebuf = buf;
 
 	if (skip_utf8_bom(&buf, size))
@@ -841,6 +858,38 @@ int add_excludes_from_file_to_list(const char *fname, const char *base,
 	return add_excludes(fname, base, baselen, el, istate, NULL);
 }
 
+int add_excludes_from_blob_to_list(
+	struct object_id *oid,
+	const char *base, int baselen,
+	struct exclude_list *el)
+{
+	char *buf;
+	unsigned long size;
+	enum object_type type;
+
+	buf = read_sha1_file(oid->hash, &type, &size);
+	if (!buf)
+		return -1;
+
+	if (type != OBJ_BLOB) {
+		free(buf);
+		return -1;
+	}
+
+	if (size == 0) {
+		free(buf);
+		return 0;
+	}
+
+	if (buf[size - 1] != '\n') {
+		buf = xrealloc(buf, st_add(size, 1));
+		buf[size++] = '\n';
+	}
+
+	add_excludes_from_buffer(buf, size, base, baselen, el);
+	return 0;
+}
+
 struct exclude_list *add_exclude_list(struct dir_struct *dir,
 				      int group_type, const char *src)
 {
diff --git a/dir.h b/dir.h
index e3717055d19336..242de635e6b994 100644
--- a/dir.h
+++ b/dir.h
@@ -256,6 +256,10 @@ extern struct exclude_list *add_exclude_list(struct dir_struct *dir,
 extern int add_excludes_from_file_to_list(const char *fname, const char *base, int baselen,
 					  struct exclude_list *el, struct index_state *istate);
 extern void add_excludes_from_file(struct dir_struct *, const char *fname);
+extern int add_excludes_from_blob_to_list(
+	struct object_id *oid,
+	const char *base, int baselen,
+	struct exclude_list *el);
 extern void parse_exclude_pattern(const char **string, int *patternlen, unsigned *flags, int *nowildcardlen);
 extern void add_exclude(const char *string, const char *base,
 			int baselen, struct exclude_list *el, int srcpos);
diff --git a/list-objects-filter-all.c b/list-objects-filter-all.c
new file mode 100644
index 00000000000000..2faccb3e4fa3cd
--- /dev/null
+++ b/list-objects-filter-all.c
@@ -0,0 +1,85 @@
+#include "cache.h"
+#include "dir.h"
+#include "tag.h"
+#include "commit.h"
+#include "tree.h"
+#include "blob.h"
+#include "diff.h"
+#include "tree-walk.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "list-objects-filter-all.h"
+
+/*
+ * A filter for list-objects to omit ALL blobs from the traversal.
+ */
+struct filter_omit_all_blobs_data {
+	struct oidset2 omits;
+};
+
+static list_objects_filter_result filter_omit_all_blobs(
+	list_objects_filter_type filter_type,
+	struct object *obj,
+	const char *pathname,
+	const char *filename,
+	void *filter_data_)
+{
+	struct filter_omit_all_blobs_data *filter_data = filter_data_;
+	int64_t object_length = -1;
+	unsigned long s;
+	enum object_type t;
+
+	switch (filter_type) {
+	default:
+		die("unknown filter_type");
+		return LOFR_ZERO;
+
+	case LOFT_BEGIN_TREE:
+		assert(obj->type == OBJ_TREE);
+		/* always include all tree objects */
+		return LOFR_MARK_SEEN | LOFR_SHOW;
+
+	case LOFT_END_TREE:
+		assert(obj->type == OBJ_TREE);
+		return LOFR_ZERO;
+
+	case LOFT_BLOB:
+		assert(obj->type == OBJ_BLOB);
+		assert((obj->flags & SEEN) == 0);
+
+		/*
+		 * Since we always omit all blobs (and never provisionally omit),
+		 * we should never see a blob twice.
+		 */
+		assert(!oidset2_contains(&filter_data->omits, &obj->oid));
+
+		t = sha1_object_info(obj->oid.hash, &s);
+		if (t == OBJ_BLOB)
+			object_length = (int64_t)((uint64_t)(s));
+
+		/* Record the omit; a blob that is missing locally keeps length -1. */
+		oidset2_insert(&filter_data->omits, &obj->oid, obj->type,
+			       object_length, NULL);
+		return LOFR_MARK_SEEN; /* but not LOFR_SHOW (hard omit) */
+	}
+}
+
+void traverse_commit_list_omit_all_blobs(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data)
+{
+	struct filter_omit_all_blobs_data d;
+
+	memset(&d, 0, sizeof(d));
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_omit_all_blobs, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
diff --git a/list-objects-filter-all.h b/list-objects-filter-all.h
new file mode 100644
index 00000000000000..591589f543665d
--- /dev/null
+++ b/list-objects-filter-all.h
@@ -0,0 +1,18 @@
+#ifndef LIST_OBJECTS_FILTER_ALL_H
+#define LIST_OBJECTS_FILTER_ALL_H
+
+#include "oidset2.h"
+
+/*
+ * A filter for list-objects to omit ALL blobs
+ * from the traversal.
+ */
+void traverse_commit_list_omit_all_blobs(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data);
+
+#endif /* LIST_OBJECTS_FILTER_ALL_H */
+
diff --git a/list-objects-filter-large.c b/list-objects-filter-large.c
new file mode 100644
index 00000000000000..1af39b69b6c674
--- /dev/null
+++ b/list-objects-filter-large.c
@@ -0,0 +1,108 @@
+#include "cache.h"
+#include "dir.h"
+#include "tag.h"
+#include "commit.h"
+#include "tree.h"
+#include "blob.h"
+#include "diff.h"
+#include "tree-walk.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "list-objects-filter-large.h"
+
+/*
+ * A filter for list-objects to omit large blobs,
+ * but always include ".git*" special files.
+ */
+struct filter_omit_large_blobs_data {
+	struct oidset2 omits;
+	int64_t max_bytes;
+};
+
+static list_objects_filter_result filter_omit_large_blobs(
+	list_objects_filter_type filter_type,
+	struct object *obj,
+	const char *pathname,
+	const char *filename,
+	void *filter_data_)
+{
+	struct filter_omit_large_blobs_data *filter_data = filter_data_;
+	int64_t object_length = -1;
+	unsigned long s;
+	enum object_type t;
+
+	switch (filter_type) {
+	default:
+		die("unknown filter_type");
+		return LOFR_ZERO;
+
+	case LOFT_BEGIN_TREE:
+		assert(obj->type == OBJ_TREE);
+		/* always include all tree objects */
+		return LOFR_MARK_SEEN | LOFR_SHOW;
+
+	case LOFT_END_TREE:
+		assert(obj->type == OBJ_TREE);
+		return LOFR_ZERO;
+
+	case LOFT_BLOB:
+		assert(obj->type == OBJ_BLOB);
+		assert((obj->flags & SEEN) == 0);
+
+		/*
+		 * Special ".git*" files are always included, regardless of size,
+		 * even if the blob was provisionally omitted under another name.
+		 */
+		if ((strncmp(filename, ".git", 4) == 0) && filename[4]) {
+			if (oidset2_contains(&filter_data->omits, &obj->oid))
+				oidset2_remove(&filter_data->omits, &obj->oid);
+			return LOFR_MARK_SEEN | LOFR_SHOW;
+		}
+		if (oidset2_contains(&filter_data->omits, &obj->oid))
+			return LOFR_ZERO; /* continue provisionally omitting it */
+
+		t = sha1_object_info(obj->oid.hash, &s);
+		assert(t == OBJ_BLOB);
+		object_length = (int64_t)((uint64_t)(s));
+
+		if (object_length < filter_data->max_bytes)
+			return LOFR_MARK_SEEN | LOFR_SHOW;
+
+		/*
+		 * Provisionally omit it. We've already established that this blob
+		 * is too big and doesn't have a special filename, so we WANT to
+		 * omit it. However, there may be a special file elsewhere in the
+		 * tree that references this same blob, so we cannot reject it yet.
+		 * Leave the LOFR_ bits unset so that if the blob appears again in
+		 * the traversal, we will be asked again.
+		 *
+		 * No need for a pathname, since we only test for special filenames
+		 * above.
+		 */
+		oidset2_insert(&filter_data->omits, &obj->oid, t, object_length,
+			       NULL);
+		return LOFR_ZERO;
+	}
+}
+
+void traverse_commit_list_omit_large_blobs(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	int64_t large_byte_limit)
+{
+	struct filter_omit_large_blobs_data d;
+
+	memset(&d, 0, sizeof(d));
+	d.max_bytes = large_byte_limit;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_omit_large_blobs, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
diff --git a/list-objects-filter-large.h b/list-objects-filter-large.h
new file mode 100644
index 00000000000000..4a5c772e30824e
--- /dev/null
+++ b/list-objects-filter-large.h
@@ -0,0 +1,18 @@
+#ifndef LIST_OBJECTS_FILTER_LARGE_H
+#define LIST_OBJECTS_FILTER_LARGE_H
+
+#include "oidset2.h"
+
+/*
+ * A filter for list-objects to omit large blobs,
+ * but always include ".git*" special files.
+ */
+void traverse_commit_list_omit_large_blobs(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	int64_t large_byte_limit);
+
+#endif /* LIST_OBJECTS_FILTER_LARGE_H */
diff --git a/list-objects-filter-sparse.c b/list-objects-filter-sparse.c
new file mode 100644
index 00000000000000..9dbfbd1f481fe4
--- /dev/null
+++ b/list-objects-filter-sparse.c
@@ -0,0 +1,226 @@
+#include "cache.h"
+#include "dir.h"
+#include "tag.h"
+#include "commit.h"
+#include "tree.h"
+#include "blob.h"
+#include "diff.h"
+#include "tree-walk.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "list-objects-filter-sparse.h"
+
+/*
+ * A filter driven by a sparse-checkout specification to only
+ * include blobs that a sparse checkout would populate.
+ *
+ * The sparse-checkout spec can be loaded from a blob with the
+ * given OID or from a local pathname. We allow an OID because
+ * the repo may be bare or we may be doing the filtering on the
+ * server.
+ */
+struct frame {
+	int defval;
+	unsigned int child_prov_omit : 1;
+};
+
+struct filter_use_sparse_data {
+	struct oidset2 omits;
+	struct exclude_list el;
+
+	size_t nr, alloc;
+	struct frame *array_frame;
+};
+
+static list_objects_filter_result filter_use_sparse(
+	list_objects_filter_type filter_type,
+	struct object *obj,
+	const char *pathname,
+	const char *filename,
+	void *filter_data_)
+{
+	struct filter_use_sparse_data *filter_data = filter_data_;
+	int64_t object_length = -1;
+	int val, dtype;
+	unsigned long s;
+	enum object_type t;
+	struct frame *frame;
+
+	switch (filter_type) {
+	default:
+		die("unknown filter_type");
+		return LOFR_ZERO;
+
+	case LOFT_BEGIN_TREE:
+		assert(obj->type == OBJ_TREE);
+		dtype = DT_DIR;
+		val = is_excluded_from_list(pathname, strlen(pathname),
+					    filename, &dtype, &filter_data->el,
+					    &the_index);
+		if (val < 0)
+			val = filter_data->array_frame[filter_data->nr].defval;
+		/* nr indexes the top frame, so pushing one needs slots 0..nr+1. */
+		ALLOC_GROW(filter_data->array_frame, filter_data->nr + 2,
+			   filter_data->alloc);
+		filter_data->nr++;
+		filter_data->array_frame[filter_data->nr].defval = val;
+		filter_data->array_frame[filter_data->nr].child_prov_omit = 0;
+
+		/*
+		 * A directory with this tree OID may appear in multiple
+		 * places in the tree. (Think of a directory move, with
+		 * no other changes.) And with a different pathname, the
+		 * is_excluded...() results for this directory and items
+		 * contained within it may be different. So we cannot
+		 * mark it SEEN (yet), since that will prevent process_tree()
+		 * from revisiting this tree object with other pathnames.
+		 *
+		 * Only SHOW the tree object the first time we visit this
+		 * tree object.
+		 *
+		 * We always show all tree objects. A future optimization
+		 * may want to attempt to narrow this.
+		 */
+		if (obj->flags & FILTER_REVISIT)
+			return LOFR_ZERO;
+		obj->flags |= FILTER_REVISIT;
+		return LOFR_SHOW;
+
+	case LOFT_END_TREE:
+		assert(obj->type == OBJ_TREE);
+		assert(filter_data->nr > 0);
+
+		frame = &filter_data->array_frame[filter_data->nr];
+		filter_data->nr--;
+
+		/*
+		 * Tell our parent directory if any of our children were
+		 * provisionally omitted.
+		 */
+		filter_data->array_frame[filter_data->nr].child_prov_omit |=
+			frame->child_prov_omit;
+
+		/*
+		 * If there are NO provisionally omitted child objects (ALL child
+		 * objects in this folder were INCLUDED), then we can mark the
+		 * folder as SEEN (so we will not have to revisit it again).
+		 */
+		if (!frame->child_prov_omit)
+			return LOFR_MARK_SEEN;
+		return LOFR_ZERO;
+
+	case LOFT_BLOB:
+		assert(obj->type == OBJ_BLOB);
+		assert((obj->flags & SEEN) == 0);
+
+		frame = &filter_data->array_frame[filter_data->nr];
+
+		/*
+		 * If we previously provisionally omitted this blob because
+		 * its pathname was not in the sparse-checkout AND this
+		 * reference to the blob has the same pathname, we can avoid
+		 * repeating the exclusion logic on this pathname and just
+		 * continue to provisionally omit it.
+		 */
+		if (obj->flags & FILTER_REVISIT) {
+			struct oidset2_entry *entry_prev;
+			entry_prev = oidset2_get(&filter_data->omits, &obj->oid);
+			if (entry_prev && !strcmp(pathname, entry_prev->pathname)) {
+				frame->child_prov_omit = 1;
+				return LOFR_ZERO;
+			}
+		}
+
+		dtype = DT_REG;
+		val = is_excluded_from_list(pathname, strlen(pathname),
+					    filename, &dtype, &filter_data->el,
+					    &the_index);
+		if (val < 0)
+			val = frame->defval;
+		if (val > 0)
+			return LOFR_MARK_SEEN | LOFR_SHOW;
+
+		/*
+		 * Lookup the blob to get its size and confirm its type,
+		 * if present. If it is missing, we substitute -1 for
+		 * the length.
+		 */
+		t = sha1_object_info(obj->oid.hash, &s);
+		if (t == OBJ_BLOB)
+			object_length = (int64_t)((uint64_t)(s));
+
+		/*
+		 * Provisionally omit it. We've already established that
+		 * this pathname is not in the sparse-checkout specification,
+		 * so we WANT to omit this blob. However, a pathname elsewhere
+		 * in the tree may also reference this same blob, so we cannot
+		 * reject it yet. Leave the LOFR_ bits unset so that if the
+		 * blob appears again in the traversal, we will be asked again.
+		 *
+		 * The pathname we associate with this omit is just the first
+		 * one we saw for this blob. Other instances of this blob may
+		 * have other pathnames and that is fine. We just use it for
+		 * perf because most of the time, the blob will be in the same
+		 * place as we walk the commits.
+		 */
+		oidset2_insert(&filter_data->omits, &obj->oid, obj->type,
+			       object_length, pathname);
+		obj->flags |= FILTER_REVISIT;
+		frame->child_prov_omit = 1;
+		return LOFR_ZERO;
+	}
+}
+
+void traverse_commit_list_use_blob(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	struct object_id *oid)
+{
+	struct filter_use_sparse_data d;
+
+	memset(&d, 0, sizeof(d));
+	if (add_excludes_from_blob_to_list(oid, NULL, 0, &d.el) < 0)
+		die("could not load filter specification");
+
+	ALLOC_GROW(d.array_frame, d.nr + 1, d.alloc);
+	d.array_frame[d.nr].defval = 0; /* default to include */
+	d.array_frame[d.nr].child_prov_omit = 0;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_use_sparse, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
+
+void traverse_commit_list_use_path(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	const char *path)
+{
+	struct filter_use_sparse_data d;
+
+	memset(&d, 0, sizeof(d));
+	if (add_excludes_from_file_to_list(path, NULL, 0, &d.el, NULL) < 0)
+		die("could not load filter specification");
+
+	ALLOC_GROW(d.array_frame, d.nr + 1, d.alloc);
+	d.array_frame[d.nr].defval = 0; /* default to include */
+	d.array_frame[d.nr].child_prov_omit = 0;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_use_sparse, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
diff --git a/list-objects-filter-sparse.h b/list-objects-filter-sparse.h
new file mode 100644
index 00000000000000..aa8939079b5c0a
--- /dev/null
+++ b/list-objects-filter-sparse.h
@@ -0,0 +1,30 @@
+#ifndef LIST_OBJECTS_FILTERS_SPARSE_H
+#define LIST_OBJECTS_FILTERS_SPARSE_H
+
+#include "oidset2.h"
+
+/*
+ * A filter driven by a sparse-checkout specification to only
+ * include blobs that a sparse checkout would populate.
+ *
+ * The sparse-checkout spec can be loaded from a blob with the
+ * given OID, a blob with a blob-ish path, or from a local pathname.
+ * We allow an OID because the repo may be bare or we may be doing
+ * the filtering on the server.
+ */ +void traverse_commit_list_use_blob( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + oidset2_foreach_cb print_omitted_object, + void *ctx_data, + struct object_id *oid); +void traverse_commit_list_use_path( + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + oidset2_foreach_cb print_omitted_object, + void *ctx_data, + const char *path); + +#endif /* LIST_OBJECTS_FILTERS_SPARSE_H */ diff --git a/list-objects.c b/list-objects.c index b3931fa434dc99..0f063d9e1abc16 100644 --- a/list-objects.c +++ b/list-objects.c @@ -7,16 +7,22 @@ #include "tree-walk.h" #include "revision.h" #include "list-objects.h" +#include "list-objects-filter-all.h" +#include "list-objects-filter-large.h" +#include "list-objects-filter-sparse.h" static void process_blob(struct rev_info *revs, struct blob *blob, show_object_fn show, struct strbuf *path, const char *name, - void *cb_data) + void *cb_data, + filter_object_fn filter, + void *filter_data) { struct object *obj = &blob->object; size_t pathlen; + list_objects_filter_result r = LOFR_MARK_SEEN | LOFR_SHOW; if (!revs->blob_objects) return; @@ -24,11 +30,15 @@ static void process_blob(struct rev_info *revs, die("bad blob object"); if (obj->flags & (UNINTERESTING | SEEN)) return; - obj->flags |= SEEN; pathlen = path->len; strbuf_addstr(path, name); - show(obj, path->buf, cb_data); + if (filter) + r = filter(LOFT_BLOB, obj, path->buf, &path->buf[pathlen], filter_data); + if (r & LOFR_MARK_SEEN) + obj->flags |= SEEN; + if (r & LOFR_SHOW) + show(obj, path->buf, cb_data); strbuf_setlen(path, pathlen); } @@ -69,7 +79,9 @@ static void process_tree(struct rev_info *revs, show_object_fn show, struct strbuf *base, const char *name, - void *cb_data) + void *cb_data, + filter_object_fn filter, + void *filter_data) { struct object *obj = &tree->object; struct tree_desc desc; @@ -77,6 +89,7 @@ static void process_tree(struct rev_info *revs, enum interesting match = 
revs->diffopt.pathspec.nr == 0 ? all_entries_interesting: entry_not_interesting; int baselen = base->len; + list_objects_filter_result r = LOFR_MARK_SEEN | LOFR_SHOW; if (!revs->tree_objects) return; @@ -90,9 +103,13 @@ static void process_tree(struct rev_info *revs, die("bad tree object %s", oid_to_hex(&obj->oid)); } - obj->flags |= SEEN; strbuf_addstr(base, name); - show(obj, base->buf, cb_data); + if (filter) + r = filter(LOFT_BEGIN_TREE, obj, base->buf, &base->buf[baselen], filter_data); + if (r & LOFR_MARK_SEEN) + obj->flags |= SEEN; + if (r & LOFR_SHOW) + show(obj, base->buf, cb_data); if (base->len) strbuf_addch(base, '/'); @@ -112,7 +129,7 @@ static void process_tree(struct rev_info *revs, process_tree(revs, lookup_tree(entry.oid), show, base, entry.path, - cb_data); + cb_data, filter, filter_data); else if (S_ISGITLINK(entry.mode)) process_gitlink(revs, entry.oid->hash, show, base, entry.path, @@ -121,8 +138,17 @@ static void process_tree(struct rev_info *revs, process_blob(revs, lookup_blob(entry.oid), show, base, entry.path, - cb_data); + cb_data, filter, filter_data); } + + if (filter) { + r = filter(LOFT_END_TREE, obj, base->buf, &base->buf[baselen], filter_data); + if (r & LOFR_MARK_SEEN) + obj->flags |= SEEN; + if (r & LOFR_SHOW) + show(obj, base->buf, cb_data); + } + strbuf_setlen(base, baselen); free_tree_buffer(tree); } @@ -183,10 +209,10 @@ static void add_pending_tree(struct rev_info *revs, struct tree *tree) add_pending_object(revs, &tree->object, ""); } -void traverse_commit_list(struct rev_info *revs, - show_commit_fn show_commit, - show_object_fn show_object, - void *data) +void traverse_commit_list_worker( + struct rev_info *revs, + show_commit_fn show_commit, show_object_fn show_object, void *show_data, + filter_object_fn filter, void *filter_data) { int i; struct commit *commit; @@ -200,7 +226,7 @@ void traverse_commit_list(struct rev_info *revs, */ if (commit->tree) add_pending_tree(revs, commit->tree); - show_commit(commit, data); + 
show_commit(commit, show_data); } for (i = 0; i < revs->pending.nr; i++) { struct object_array_entry *pending = revs->pending.objects + i; @@ -211,19 +237,19 @@ void traverse_commit_list(struct rev_info *revs, continue; if (obj->type == OBJ_TAG) { obj->flags |= SEEN; - show_object(obj, name, data); + show_object(obj, name, show_data); continue; } if (!path) path = ""; if (obj->type == OBJ_TREE) { process_tree(revs, (struct tree *)obj, show_object, - &base, path, data); + &base, path, show_data, filter, filter_data); continue; } if (obj->type == OBJ_BLOB) { process_blob(revs, (struct blob *)obj, show_object, - &base, path, data); + &base, path, show_data, filter, filter_data); continue; } die("unknown pending object %s (%s)", @@ -232,3 +258,45 @@ void traverse_commit_list(struct rev_info *revs, object_array_clear(&revs->pending); strbuf_release(&base); } + +void traverse_commit_list(struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + void *show_data) +{ + traverse_commit_list_worker( + revs, + show_commit, show_object, show_data, + NULL, NULL); +} + +void traverse_commit_list_filtered( + struct object_filter_options *filter_options, + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + oidset2_foreach_cb print_omitted_object, + void *show_data) +{ + if (filter_options->omit_all_blobs) + traverse_commit_list_omit_all_blobs( + revs, show_commit, show_object, print_omitted_object, show_data); + + else if (filter_options->omit_large_blobs) + traverse_commit_list_omit_large_blobs( + revs, show_commit, show_object, print_omitted_object, show_data, + (int64_t)(uint64_t)filter_options->large_byte_limit); + + else if (filter_options->use_blob) + traverse_commit_list_use_blob( + revs, show_commit, show_object, print_omitted_object, show_data, + &filter_options->sparse_oid); + + else if (filter_options->use_path) + traverse_commit_list_use_path( + revs, show_commit, show_object, print_omitted_object, show_data,
+ filter_options->sparse_value); + + else + die("unspecified list-objects filter"); +} diff --git a/list-objects.h b/list-objects.h index 0cebf8585cb179..a8acedcdce250e 100644 --- a/list-objects.h +++ b/list-objects.h @@ -1,6 +1,9 @@ #ifndef LIST_OBJECTS_H #define LIST_OBJECTS_H +#include "oidset2.h" +#include "object-filter.h" + typedef void (*show_commit_fn)(struct commit *, void *); typedef void (*show_object_fn)(struct object *, const char *, void *); void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *); @@ -8,4 +11,42 @@ void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, voi typedef void (*show_edge_fn)(struct commit *); void mark_edges_uninteresting(struct rev_info *, show_edge_fn); +enum list_objects_filter_result { + LOFR_ZERO = 0, + LOFR_MARK_SEEN = 1<<0, + LOFR_SHOW = 1<<1, +}; + +/* Presumably an object flag bit reserved for the filters; see object.h and revision.h for the other flag bits (NOTE(review): confirm bit 25 is free there) */ +#define FILTER_REVISIT (1<<25) + +enum list_objects_filter_type { + LOFT_BEGIN_TREE, + LOFT_END_TREE, + LOFT_BLOB +}; + +typedef enum list_objects_filter_result list_objects_filter_result; +typedef enum list_objects_filter_type list_objects_filter_type; + +typedef list_objects_filter_result (*filter_object_fn)( + list_objects_filter_type filter_type, + struct object *obj, + const char *pathname, + const char *filename, + void *filter_data); + +void traverse_commit_list_worker( + struct rev_info *, + show_commit_fn, show_object_fn, void *show_data, + filter_object_fn filter, void *filter_data); + +void traverse_commit_list_filtered( + struct object_filter_options *filter_options, + struct rev_info *revs, + show_commit_fn show_commit, + show_object_fn show_object, + oidset2_foreach_cb print_omitted_object, + void *show_data); + #endif diff --git a/object-filter.c b/object-filter.c new file mode 100644 index 00000000000000..1c74bd125aedeb --- /dev/null +++ b/object-filter.c @@ -0,0 +1,269 @@ +#include "cache.h" +#include "commit.h" +#include "config.h" +#include "revision.h" +#include
"list-objects.h" +#include "oidset2.h" +#include "list-objects-filter-all.h" +#include "list-objects-filter-large.h" +#include "list-objects-filter-sparse.h" +#include "object-filter.h" + +int parse_filter_omit_all_blobs(struct object_filter_options *filter_options) +{ + if (object_filter_enabled(filter_options)) + die(_("multiple object filter types cannot be combined")); + + filter_options->omit_all_blobs = 1; + return 0; +} + +int parse_filter_omit_large_blobs(struct object_filter_options *filter_options, + const char *arg) +{ + if (object_filter_enabled(filter_options)) + die(_("multiple object filter types cannot be combined")); + + filter_options->omit_large_blobs = 1; + + /* we allow "<n>[kmg]"; the original text is kept alongside the parsed value */ + if (!git_parse_ulong(arg, &filter_options->large_byte_limit)) + die(_("invalid size limit for large object filter")); + + filter_options->large_byte_limit_string = xstrdup(arg); + return 0; +} + +int parse_filter_use_blob(struct object_filter_options *filter_options, + const char *arg) +{ + struct object_context oc; + + if (object_filter_enabled(filter_options)) + die(_("multiple object filter types cannot be combined")); + + filter_options->use_blob = 1; + + /* + * The command line argument needs to resolve to a known OID + * representing the content of the desired sparse-checkout file. + * + * We allow various syntax forms for the convenience of the user. + * See sha1_name.c:get_oid_with_context_1(). + * + * Try to evaluate the arg locally in case they use one of the + * convenience patterns. This must resolve to a blob. + */ + if (get_oid_with_context(arg, GET_OID_BLOB, + &filter_options->sparse_oid, &oc)) { + /* + * If that fails, keep the original string in case a client + * command wants to send it to the server. This allows the + * client to name an OID for a blob they don't have. + */ + filter_options->sparse_value = xstrdup(arg); + oidcpy(&filter_options->sparse_oid, &null_oid); + } else { + /* + * Round-trip the found OID to normalize it.
+ */ + filter_options->sparse_value = + xstrdup(oid_to_hex(&filter_options->sparse_oid)); + } + return 0; +} + +int parse_filter_use_path(struct object_filter_options *filter_options, + const char *arg) +{ + if (object_filter_enabled(filter_options)) + die(_("multiple object filter types cannot be combined")); + + filter_options->use_path = 1; + + /* + * The command line argument needs to resolve to a local file + * containing the content of the desired sparse-checkout file. + */ + filter_options->sparse_value = xstrdup(arg); + return 0; +} + +int parse_filter_print_omitted(struct object_filter_options *filter_options) +{ + filter_options->print_omitted = 1; + return 0; +} + +int parse_filter_print_missing(struct object_filter_options *filter_options) +{ + filter_options->print_missing = 1; + return 0; +} + +int parse_filter_relax(struct object_filter_options *filter_options) +{ + filter_options->relax = 1; + return 0; +} + +int opt_parse_filter_omit_all_blobs(const struct option *opt, + const char *arg, int unset) +{ + struct object_filter_options *filter_options = opt->value; + + assert(!arg); + assert(!unset); + + return parse_filter_omit_all_blobs(filter_options); +} + +int opt_parse_filter_omit_large_blobs(const struct option *opt, + const char *arg, int unset) +{ + struct object_filter_options *filter_options = opt->value; + + assert(arg); + assert(!unset); + + return parse_filter_omit_large_blobs(filter_options, arg); +} + +int opt_parse_filter_use_blob(const struct option *opt, + const char *arg, int unset) +{ + struct object_filter_options *filter_options = opt->value; + + assert(arg); + assert(!unset); + + return parse_filter_use_blob(filter_options, arg); +} + +int opt_parse_filter_use_path(const struct option *opt, + const char *arg, int unset) +{ + struct object_filter_options *filter_options = opt->value; + + assert(arg); + assert(!unset); + + return parse_filter_use_path(filter_options, arg); +} + +int opt_parse_filter_print_omitted(const struct option
*opt, + const char *arg, int unset) +{ + struct object_filter_options *filter_options = opt->value; + + assert(!arg); + assert(!unset); + + return parse_filter_print_omitted(filter_options); +} + +int opt_parse_filter_print_missing(const struct option *opt, + const char *arg, int unset) +{ + struct object_filter_options *filter_options = opt->value; + + assert(!arg); + assert(!unset); + + return parse_filter_print_missing(filter_options); +} + +int opt_parse_filter_relax(const struct option *opt, + const char *arg, int unset) +{ + struct object_filter_options *filter_options = opt->value; + + assert(!arg); + assert(!unset); + + return parse_filter_relax(filter_options); +} + +int object_filter_hand_parse_arg(struct object_filter_options *filter_options, + const char *arg, + int allow_print_omitted, + int allow_print_missing, + int allow_relax) +{ + if (!strcmp(arg, ("--"CL_ARG_FILTER_OMIT_ALL_BLOBS))) { + parse_filter_omit_all_blobs(filter_options); + return 1; + } + if (skip_prefix(arg, ("--"CL_ARG_FILTER_OMIT_LARGE_BLOBS"="), &arg)) { + parse_filter_omit_large_blobs(filter_options, arg); + return 1; + } + if (skip_prefix(arg, ("--"CL_ARG_FILTER_USE_BLOB"="), &arg)) { + parse_filter_use_blob(filter_options, arg); + return 1; + } + if (skip_prefix(arg, ("--"CL_ARG_FILTER_USE_PATH"="), &arg)) { + parse_filter_use_path(filter_options, arg); + return 1; + } + + if (allow_print_omitted && + !strcmp(arg, ("--"CL_ARG_FILTER_PRINT_OMITTED))) { + parse_filter_print_omitted(filter_options); + return 1; + } + + if (allow_print_missing && + !strcmp(arg, ("--"CL_ARG_FILTER_PRINT_MISSING))) { + parse_filter_print_missing(filter_options); + return 1; + } + + if (allow_relax && !strcmp(arg, ("--"CL_ARG_FILTER_RELAX))) { + parse_filter_relax(filter_options); + return 1; + } + + return 0; +} + +int object_filter_hand_parse_protocol(struct object_filter_options *filter_options, + const char *arg, + int allow_print_omitted, + int allow_print_missing, + int allow_relax) +{ + if
(!strcmp(arg, CL_ARG_FILTER_OMIT_ALL_BLOBS)) { + parse_filter_omit_all_blobs(filter_options); + return 1; + } + if (skip_prefix(arg, (CL_ARG_FILTER_OMIT_LARGE_BLOBS" "), &arg)) { + parse_filter_omit_large_blobs(filter_options, arg); + return 1; + } + if (skip_prefix(arg, (CL_ARG_FILTER_USE_BLOB" "), &arg)) { + parse_filter_use_blob(filter_options, arg); + return 1; + } + if (skip_prefix(arg, (CL_ARG_FILTER_USE_PATH" "), &arg)) { + parse_filter_use_path(filter_options, arg); + return 1; + } + + if (allow_print_omitted && + !strcmp(arg, CL_ARG_FILTER_PRINT_OMITTED)) { + parse_filter_print_omitted(filter_options); + return 1; + } + if (allow_print_missing && + !strcmp(arg, CL_ARG_FILTER_PRINT_MISSING)) { + parse_filter_print_missing(filter_options); + return 1; + } + if (allow_relax && !strcmp(arg, CL_ARG_FILTER_RELAX)) { + parse_filter_relax(filter_options); + return 1; + } + + return 0; +} diff --git a/object-filter.h b/object-filter.h new file mode 100644 index 00000000000000..fde09a4b67eaae --- /dev/null +++ b/object-filter.h @@ -0,0 +1,173 @@ +#ifndef OBJECT_FILTER_H +#define OBJECT_FILTER_H + +#include "parse-options.h" + +/* + * Common declarations and utilities for filtering objects (such as omitting + * large blobs) in list_objects:traverse_commit_list() and git-rev-list. + */ + +struct object_filter_options { + /* + * File pathname or blob-ish path/OID (that get_oid_with_context() can + * use to find the blob containing the sparse-checkout specification). + * This is only used when use_blob or use_path is set. + */ + const char *sparse_value; + struct object_id sparse_oid; + + /* + * Blob size byte limit for filtering. Only blobs smaller than this + * value will be included. A value of zero omits all blobs. + * Only used when omit_large_blobs is set. Integer and string versions + * of this are kept for convenience. The string version may contain + * a [kmg] suffix.
+ */ + unsigned long large_byte_limit; + const char *large_byte_limit_string; + + /* Valid filter types (only one may be used at a time) */ + unsigned omit_all_blobs : 1; + unsigned omit_large_blobs : 1; + unsigned use_blob : 1; + unsigned use_path : 1; + + /* + * True if rev-list should print a list of the objects omitted + * by this invocation of a filter. + */ + unsigned print_omitted : 1; + + /* + * True if rev-list should print a list of missing objects. + * Objects can be missing because of a previously filtered + * clone or fetch. The set reported here can also be filtered + * by the current filter in effect. + */ + unsigned print_missing : 1; + + /* True to suppress missing object errors during consistency checks */ + unsigned relax : 1; +}; + +/* + * Return true if any one of the four filter types is enabled. + */ +static inline int object_filter_enabled(const struct object_filter_options *p) +{ + return p->omit_all_blobs || + p->omit_large_blobs || + p->use_blob || + p->use_path; +} + +/* Normalized command line arguments */ +#define CL_ARG_FILTER_OMIT_ALL_BLOBS "filter-omit-all-blobs" +#define CL_ARG_FILTER_OMIT_LARGE_BLOBS "filter-omit-large-blobs" +#define CL_ARG_FILTER_USE_BLOB "filter-use-blob" +#define CL_ARG_FILTER_USE_PATH "filter-use-path" +#define CL_ARG_FILTER_PRINT_OMITTED "filter-print-omitted" +#define CL_ARG_FILTER_PRINT_MISSING "filter-print-missing" +#define CL_ARG_FILTER_RELAX "filter-relax" + +/* + * Common command line argument parsing for object-filter-related + * arguments (whether from a hand-parsed or parse-options style + * parser).
+ */ +int parse_filter_omit_all_blobs(struct object_filter_options *filter_options); +int parse_filter_omit_large_blobs(struct object_filter_options *filter_options, + const char *arg); +int parse_filter_use_blob(struct object_filter_options *filter_options, + const char *arg); +int parse_filter_use_path(struct object_filter_options *filter_options, + const char *arg); +int parse_filter_print_omitted(struct object_filter_options *filter_options); +int parse_filter_print_missing(struct object_filter_options *filter_options); +int parse_filter_relax(struct object_filter_options *filter_options); + +/* + * Common command line argument parsers for object-filter-related + * arguments coming from parse-options style parsers. + */ + +int opt_parse_filter_omit_all_blobs(const struct option *opt, + const char *arg, int unset); +int opt_parse_filter_omit_large_blobs(const struct option *opt, + const char *arg, int unset); +int opt_parse_filter_use_blob(const struct option *opt, + const char *arg, int unset); +int opt_parse_filter_use_path(const struct option *opt, + const char *arg, int unset); +int opt_parse_filter_print_omitted(const struct option *opt, + const char *arg, int unset); +int opt_parse_filter_print_missing(const struct option *opt, + const char *arg, int unset); +int opt_parse_filter_relax(const struct option *opt, + const char *arg, int unset); + +#define OPT_PARSE_FILTER_OMIT_ALL_BLOBS(fo) \ + { OPTION_CALLBACK, 0, CL_ARG_FILTER_OMIT_ALL_BLOBS, fo, NULL, \ + N_("omit all blobs from result"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, \ + opt_parse_filter_omit_all_blobs } + +#define OPT_PARSE_FILTER_OMIT_LARGE_BLOBS(fo) \ + { OPTION_CALLBACK, 0, CL_ARG_FILTER_OMIT_LARGE_BLOBS, fo, N_("size"), \ + N_("omit large blobs from result"), PARSE_OPT_NONEG, \ + opt_parse_filter_omit_large_blobs } + +#define OPT_PARSE_FILTER_USE_BLOB(fo) \ + { OPTION_CALLBACK, 0, CL_ARG_FILTER_USE_BLOB, fo, N_("object"), \ + N_("filter results using sparse-checkout specification"),
PARSE_OPT_NONEG, \ + opt_parse_filter_use_blob } + +#define OPT_PARSE_FILTER_USE_PATH(fo) \ + { OPTION_CALLBACK, 0, CL_ARG_FILTER_USE_PATH, fo, N_("path"), \ + N_("filter results using sparse-checkout specification"), PARSE_OPT_NONEG, \ + opt_parse_filter_use_path } + +#define OPT_PARSE_FILTER_PRINT_OMITTED(fo) \ + { OPTION_CALLBACK, 0, CL_ARG_FILTER_PRINT_OMITTED, fo, NULL, \ + N_("print list of omitted objects"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, \ + opt_parse_filter_print_omitted } + +#define OPT_PARSE_FILTER_PRINT_MISSING(fo) \ + { OPTION_CALLBACK, 0, CL_ARG_FILTER_PRINT_MISSING, fo, NULL, \ + N_("print list of missing objects"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, \ + opt_parse_filter_print_missing } + +#define OPT_PARSE_FILTER_RELAX(fo) \ + { OPTION_CALLBACK, 0, CL_ARG_FILTER_RELAX, fo, NULL, \ + N_("relax consistency checks for previously omitted objects"), \ + PARSE_OPT_NOARG | PARSE_OPT_NONEG, opt_parse_filter_relax } + +/* + * Hand parse known object-filter command line options. + * Use this when the caller DOES NOT use the normal OPT_ + * routines. + * + * Here we assume args of the form "--<key>" or "--<key>=<value>". + * Note the literal dash-dash and equals. + * + * Returns 1 if we handled the argument. + */ +int object_filter_hand_parse_arg(struct object_filter_options *filter_options, + const char *arg, + int allow_print_omitted, + int allow_print_missing, + int allow_relax); + +/* + * Hand parse known object-filter protocol lines. + * + * Here we assume args of the form "<key>" or "<key> <value>". + * Note the literal space between the key and value.
+ */ +int object_filter_hand_parse_protocol(struct object_filter_options *filter_options, + const char *arg, + int allow_print_omitted, + int allow_print_missing, + int allow_relax); + +#endif /* OBJECT_FILTER_H */ diff --git a/oidset2.c b/oidset2.c new file mode 100644 index 00000000000000..3c0e625039bf51 --- /dev/null +++ b/oidset2.c @@ -0,0 +1,125 @@ +#include "cache.h" +#include "oidset2.h" + +static int oidset2_hashcmp(const void *unused_cmp_data, + const void *va, const void *vb, + const void *vkey) +{ + const struct oidset2_entry *a = va, *b = vb; + const struct object_id *key = vkey; + return oidcmp(&a->oid, key ? key : &b->oid); +} + +struct oidset2_entry *oidset2_get(const struct oidset2 *set, const struct object_id *oid) +{ + struct hashmap_entry key; + struct oidset2_entry *value; + + if (!set->map.cmpfn) + return NULL; + + hashmap_entry_init(&key, sha1hash(oid->hash)); + value = hashmap_get(&set->map, &key, oid); + + return value; +} + +int oidset2_contains(const struct oidset2 *set, const struct object_id *oid) +{ + return !!oidset2_get(set, oid); +} + +int oidset2_insert(struct oidset2 *set, const struct object_id *oid, + enum object_type type, int64_t object_length, + const char *pathname) +{ + struct oidset2_entry *entry; + + if (!set->map.cmpfn) + hashmap_init(&set->map, oidset2_hashcmp, NULL, 0); + + if (oidset2_contains(set, oid)) + return 1; + + entry = xcalloc(1, sizeof(*entry)); + hashmap_entry_init(&entry->hash, sha1hash(oid->hash)); + oidcpy(&entry->oid, oid); + + entry->type = type; + entry->object_length = object_length; + if (pathname) + entry->pathname = xstrdup(pathname); + + hashmap_add(&set->map, entry); + return 0; +} + +void oidset2_remove(struct oidset2 *set, const struct object_id *oid) +{ + struct hashmap_entry key; + struct oidset2_entry *e; + + /* A set that was never inserted into has no table to search. */ + if (!set->map.cmpfn) + return; + + hashmap_entry_init(&key, sha1hash(oid->hash)); + e = hashmap_remove(&set->map, &key, oid); + if (!e) + return; /* oid was not in the set */ + + free(e->pathname); + free(e); +} + +void oidset2_clear(struct oidset2 *set) +{ +
struct hashmap_iter iter; + struct oidset2_entry *e; + + if (!set->map.cmpfn) + return; + + /* hashmap_free() releases the entries but not the pathname each one owns. */ + hashmap_iter_init(&set->map, &iter); + while ((e = hashmap_iter_next(&iter))) + free(e->pathname); + + hashmap_free(&set->map, 1); +} + +static int oidset2_cmp(const void *a, const void *b) +{ + const struct oidset2_entry *ae = *((const struct oidset2_entry **)a); + const struct oidset2_entry *be = *((const struct oidset2_entry **)b); + + return oidcmp(&ae->oid, &be->oid); +} + +/* Call cb once for every entry in the set, in ascending OID order. */ +void oidset2_foreach(struct oidset2 *set, oidset2_foreach_cb cb, void *cb_data) +{ + struct hashmap_iter iter; + struct oidset2_entry **array; + struct oidset2_entry *e; + int j, k; + + if (!set || !set->map.cmpfn) + return; + + array = xcalloc(hashmap_get_size(&set->map), sizeof(*array)); + + hashmap_iter_init(&set->map, &iter); + k = 0; + while ((e = hashmap_iter_next(&iter))) + array[k++] = e; + + QSORT(array, k, oidset2_cmp); + + for (j = 0; j < k; j++) { + e = array[j]; + cb(j, k, e, cb_data); + } + + free(array); +} diff --git a/oidset2.h b/oidset2.h new file mode 100644 index 00000000000000..67d8a5a13f5df0 --- /dev/null +++ b/oidset2.h @@ -0,0 +1,58 @@ +#ifndef OIDSET2_H +#define OIDSET2_H + +/** + * oidset2 is a variant of oidset, but allows additional fields for each object. + */ + +/** + * A single oidset2; should be zero-initialized (or use OIDSET2_INIT). + */ +struct oidset2 { + struct hashmap map; +}; + +#define OIDSET2_INIT { { NULL } } + +struct oidset2_entry { + struct hashmap_entry hash; + struct object_id oid; + + enum object_type type; + int64_t object_length; /* This is SIGNED. Use -1 when unknown. */ + char *pathname; +}; + +struct oidset2_entry *oidset2_get(const struct oidset2 *set, const struct object_id *oid); + +/** + * Returns true iff `set` contains `oid`. + */ +int oidset2_contains(const struct oidset2 *set, const struct object_id *oid); + +/** + * Insert the oid into the set; a copy is made, so "oid" does not need + * to persist after this function is called. + * + * Returns 1 if the oid was already in the set, 0 otherwise. This can be used + * to perform an efficient check-and-add.
+ */ +int oidset2_insert(struct oidset2 *set, const struct object_id *oid, + enum object_type type, int64_t object_length, + const char *pathname); + +void oidset2_remove(struct oidset2 *set, const struct object_id *oid); + +typedef void (*oidset2_foreach_cb)( + int i, int i_limit, + struct oidset2_entry *e, void *cb_data); + +void oidset2_foreach(struct oidset2 *set, oidset2_foreach_cb cb, void *cb_data); + +/** + * Remove all entries from the oidset2, freeing any resources associated with + * it. + */ +void oidset2_clear(struct oidset2 *set); + +#endif /* OIDSET2_H */ diff --git a/sha1_name.c b/sha1_name.c index 134ac9742f9eed..d1daabff36a2a1 100644 --- a/sha1_name.c +++ b/sha1_name.c @@ -1646,6 +1646,6 @@ void maybe_die_on_misspelt_object_name(const char *name, const char *prefix) int get_oid_with_context(const char *str, unsigned flags, struct object_id *oid, struct object_context *oc) { if (flags & GET_OID_FOLLOW_SYMLINKS && flags & GET_OID_ONLY_TO_DIE) - die("BUG: incompatible flags for get_sha1_with_context"); + die("BUG: incompatible flags for get_oid_with_context"); return get_oid_with_context_1(str, flags, NULL, oid, oc); } diff --git a/t/t6112-rev-list-filters-objects.sh b/t/t6112-rev-list-filters-objects.sh new file mode 100755 index 00000000000000..66ff022fce9b4e --- /dev/null +++ b/t/t6112-rev-list-filters-objects.sh @@ -0,0 +1,237 @@ +#!/bin/sh + +test_description='git rev-list with object filtering' + +. ./test-lib.sh + +# test the omit-all filter + +test_expect_success 'setup' ' + echo "{print \$1}" >print_1.awk && + echo "{print \$2}" >print_2.awk && + + for n in 1 2 3 4 5 + do + echo $n > file.$n && + git add file.$n && + git commit -m "$n" || return 1 + done +' + +# Verify the omitted ("~OID") lines match the predicted list of OIDs.
+test_expect_success 'omit-all-blobs omitted 5 blobs' ' + git ls-files -s file.1 file.2 file.3 file.4 file.5 \ + | awk -f print_2.awk \ + | sort >expected && + git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-all-blobs \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +# Verify the complete OID list matches the unfiltered OIDs plus the omitted OIDs. +test_expect_success 'omit-all-blobs nothing else changed' ' + git rev-list HEAD --objects \ + | awk -f print_1.awk \ + | sort >expected && + git rev-list HEAD --objects --filter-print-omitted --filter-omit-all-blobs \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +# test the size-based filtering. + +test_expect_success 'setup_large' ' + for n in 1000 10000 + do + printf "%"$n"s" X > large.$n && + git add large.$n && + git commit -m "$n" || return 1 + done +' + +test_expect_success 'omit-large-blobs omit 2 blobs' ' + git ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=500 \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'omit-large-blobs nothing else changed' ' + git rev-list HEAD --objects \ + | awk -f print_1.awk \ + | sort >expected && + git rev-list HEAD --objects --filter-print-omitted --filter-omit-large-blobs=500 \ + | awk -f print_1.awk \ + | sed "s/~//" \ + | sort >observed && + test_cmp observed expected +' + +# boundary test around the size parameter. +# filter is strictly less than the value, so size 500 and 1000 should have the +# same results, but 1001 should filter more.
+ +test_expect_success 'omit-large-blobs omit 2 blobs (limit 1000)' ' + git ls-files -s large.1000 large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1000 \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'omit-large-blobs omit 1 blob' ' + git ls-files -s large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1001 \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'omit-large-blobs omit 1 blob (1k)' ' + git ls-files -s large.10000 \ + | awk -f print_2.awk \ + | sort >expected && + git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1k \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +test_expect_success 'omit-large-blobs omit no blob (1m)' ' + cat </dev/null >expected && + git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1m \ + | awk -f print_1.awk \ + | sed "s/~//" >observed && + test_cmp observed expected +' + +# Test sparse-pattern filtering (using explicit local patterns). +# We use the same disk format as sparse-checkout to specify the +# filtering, but do not require sparse-checkout to be enabled. + +test_expect_success 'setup using sparse file' ' + mkdir dir1 && + for n in sparse1 sparse2 + do + echo $n > $n && + git add $n && + echo dir1/$n > dir1/$n && + git add dir1/$n || return 1 + done && + git commit -m "sparse" && + echo dir1/ >pattern1 && + echo sparse1 >pattern2 +' + +# pattern1 should only include the 2 dir1/* files. +# and omit the 5 file.*, 2 large.*, and 2 top-level sparse* files.
+test_expect_success 'sparse using path pattern1' ' + git rev-list HEAD --objects --filter-print-omitted --filter-use-path=pattern1 >out && + + grep "^~" out >blobs.omitted && + test $(cat blobs.omitted | wc -l) = 9 && + + grep "dir1/sparse" out >blobs.included && + test $(cat blobs.included | wc -l) = 2 +' + +# pattern2 should include the sparse1 and dir1/sparse1. +# and omit the 5 file.*, 2 large.*, and the 2 sparse2 files. +test_expect_success 'sparse using path pattern2' ' + git rev-list HEAD --objects --filter-print-omitted --filter-use-path=pattern2 >out && + + grep "^~" out >blobs.omitted && + test $(cat blobs.omitted | wc -l) = 9 && + + grep "sparse1" out >blobs.included && + test $(cat blobs.included | wc -l) = 2 +' + +# Test sparse-pattern filtering (using a blob in the repo). +# This could be used to later let pack-objects do filtering. + +# pattern1 should only include the 2 dir1/* files. +# and omit the 5 file.*, 2 large.*, 2 top-level sparse*, and 1 pattern file. +test_expect_success 'sparse using OID for pattern1' ' + git add pattern1 && + git commit -m "pattern1" && + + git rev-list HEAD --objects >normal.output && + grep "pattern1" <normal.output | awk -f print_1.awk >pattern1.oid && + + git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=`cat pattern1.oid` >out && + + grep "^~" out >blobs.omitted && + test $(cat blobs.omitted | wc -l) = 10 && + + grep "dir1/sparse" out >blobs.included && + test $(cat blobs.included | wc -l) = 2 +' + +# repeat previous test but use blob-ish expression rather than OID. +test_expect_success 'sparse using blob-ish to get OID for pattern spec' ' + git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=HEAD:pattern1 >out && + + grep "^~" out >blobs.omitted && + test $(cat blobs.omitted | wc -l) = 10 && + + grep "dir1/sparse" out >blobs.included && + test $(cat blobs.included | wc -l) = 2 +' + +# pattern2 should include the sparse1 and dir1/sparse1. +# and omit the 5 file.*, 2 large.*, 2 top-level sparse*, and 2 pattern files.
+test_expect_success 'sparse using OID for pattern2' ' + git add pattern2 && + git commit -m "pattern2" && + + git rev-list HEAD --objects >normal.output && + grep "pattern2" <normal.output | awk -f print_1.awk >pattern2.oid && + + git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=`cat pattern2.oid` >out && + + grep "^~" out >blobs.omitted && + test $(cat blobs.omitted | wc -l) = 11 && + + grep "sparse1" out >blobs.included && + test $(cat blobs.included | wc -l) = 2 +' + +# repeat previous test but use blob-ish expression rather than OID. +test_expect_success 'sparse using blob-ish rather than OID for pattern2' ' + git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=HEAD:pattern2 >out && + + grep "^~" out >blobs.omitted && + test $(cat blobs.omitted | wc -l) = 11 && + + grep "sparse1" out >blobs.included && + test $(cat blobs.included | wc -l) = 2 +' + +# delete some loose objects and test rev-list printing them as missing. +test_expect_success 'print missing objects' ' + git ls-files -s file.1 file.2 file.3 file.4 file.5 \ + | awk -f print_2.awk \ + | sort >expected && + for id in `cat expected | sed "s|..|&/|"` + do + rm .git/objects/$id || return 1 + done && + git rev-list --quiet HEAD --filter-print-missing --objects \ + | awk -f print_1.awk \ + | sed "s/?//" \ + | sort >observed && + test_cmp observed expected +' + +test_done