diff --git a/common/arg.cpp b/common/arg.cpp
index 56827a65908be..d70d617f81001 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2647,6 +2647,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--show-statistics"},
+        string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+        [](common_params & params) {
+            params.show_statistics = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--parse-special"},
         string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
diff --git a/common/common.h b/common/common.h
index a5abe32859fdd..85182f2c7dddb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -420,9 +420,10 @@ struct common_params {
    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk     = 0; // start processing from this chunk

-    bool process_output = false; // collect data for the output tensor
-    bool compute_ppl    = true;  // whether to compute perplexity
-    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization
+    bool process_output  = false; // collect data for the output tensor
+    bool compute_ppl     = true;  // whether to compute perplexity
+    bool show_statistics = false; // show imatrix statistics per tensor
+    bool parse_special   = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
diff --git a/tools/imatrix/README.md b/tools/imatrix/README.md
index 6d8897d98bb61..c5ac382bf5005 100644
--- a/tools/imatrix/README.md
+++ b/tools/imatrix/README.md
@@ -1,33 +1,80 @@
 # llama.cpp/tools/imatrix

 Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
-More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861
+More information is [available here](https://github.com/ggml-org/llama.cpp/pull/4861)

 ## Usage

 ```
 ./llama-imatrix \
-    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
-    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
-    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \
+    [--chunk 123] [--output-frequency 10] [--save-frequency 0] [--show-statistics] \
+    [--no-ppl] [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] \
+    [--parse-special] [...]
 ```

-Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
+Here `-m | --model` with a model name and `-f | --file` with a file containing calibration data (such as `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:

-* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
-* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `-h | --help` shows usage information and exits.
+* `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
+* `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing, `imatrix.dat` is used.
+* `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks).
 * `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
-* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--in-file` specifies one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
+* `--parse-special` enables parsing of special tokens (e.g., `<|im_start|>` in some models). Useful for models with custom tokenizers.
+* `--chunk` skips the first `n` chunks of tokens from the input data. Useful for resuming or skipping initial low-quality data.
+* `-n | --n-chunks` specifies the maximum number of chunks to process. Default is -1 for all available chunks.
+* `--no-ppl` disables the calculation of perplexity for the processed chunks. Useful if you want to speed up processing and do not care about perplexity.
+* `--show-statistics` displays the imatrix file's statistics and then exits.

-For faster computation, make sure to use GPU offloading via the `-ngl` argument
+For faster computation, make sure to use GPU offloading via the `-ngl | --n-gpu-layers` argument.

-## Example
+## Examples

 ```bash
-# generate importance matrix (imatrix.dat)
-./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
+# generate importance matrix using the default filename (imatrix.dat), offloading 99 layers to GPU
+./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -ngl 99

 # use the imatrix to perform a Q4_K_M quantization
 ./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
 ```
+
+```bash
+# combine existing imatrices
+./llama-imatrix --in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat -o imatrix-combined.dat
+```
+
+```bash
+# skip the first 5 chunks, save intermediates every 20 chunks and snapshots every 50, parsing special tokens
+./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --chunk 5 --output-frequency 20 --save-frequency 50 --parse-special
+```
+
+```bash
+# analyse an imatrix file and display summary statistics instead of running inference
+./llama-imatrix --in-file imatrix.dat --show-statistics
+```
+
+`--show-statistics` will display the following statistics:
+
+#### Per tensor
+
+* Σ(Act²): sum of all squared activations (the importance scores)
+* Min & Max: minimum and maximum squared activation values
+* μ & σ: mean and standard deviation of the squared activations
+* % Active: proportion of elements whose average squared activation exceeds a small threshold (1e-5). Helpful to determine how alive/dormant the tensor is during inference
+* N: number of squared activations
+* Entropy: entropy of the squared activation distribution, in bits (standard Shannon entropy), $S = -\sum_{i=1}^N p_i \log_2 p_i$
+* E (norm): normalized entropy, $E(norm)=\frac{-\sum_{i=1}^N p_i \log_2 p_i}{\log_2 N}$. These two metrics can be used to determine how well a prompt "exercises" the model's capabilities
+* ZD Score: proportion of elements whose z-score exceeds 1, following _3.1 Layer Importance Scores_ of [Layer-Wise Quantization](https://arxiv.org/abs/2406.17415)
+* CosSim: cosine similarity with respect to the previous layer's tensor. Useful to determine how similar the squared activations of the current layer are to the previous layer's squared activations
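+
+As a rough reference, the sketch below shows how the per-tensor entropy, normalized entropy and ZD score can be derived from a vector of mean squared activations. It is a self-contained toy example; the input values and the program itself are illustrative, not the tool's actual implementation:
+
+```cpp
+#include <cmath>
+#include <cstdio>
+#include <vector>
+
+int main() {
+    // toy input: mean squared activation of each element (values[i] / counts[i])
+    const std::vector<float> act = {0.1f, 0.4f, 0.2f, 0.3f};
+
+    double total = 0.0;
+    for (float a : act) total += a;
+    const double mean = total / act.size();
+
+    double var = 0.0;
+    for (float a : act) var += (a - mean) * (a - mean);
+    const double stddev = std::sqrt(var / act.size());
+
+    // Shannon entropy of p_i = act_i / total, in bits
+    double entropy = 0.0;
+    for (float a : act) {
+        const double p = a / total;
+        if (p > 0.0) entropy -= p * std::log2(p);
+    }
+    const double entropy_norm = entropy / std::log2((double) act.size());
+
+    // ZD score: fraction of elements whose z-score is above 1
+    int z = 0;
+    for (float a : act) {
+        if (stddev > 0.0 && (a - mean) / stddev > 1.0) z++;
+    }
+
+    printf("entropy = %.4f bits, E(norm) = %.2f%%, ZD = %.2f%%\n",
+           entropy, 100.0 * entropy_norm, 100.0 * z / act.size());
+    return 0;
+}
+```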
+
+#### Per layer
+
+Weighted averages of Σ(Act²), ZD Score and CosSim are also calculated, with each tensor weighted by its number of elements.
+
+#### Important note on the computed statistics
+
+When using these statistics, please note that they are computed on the squared activations, **not on the actual (raw) activations**.
+Whilst the results are still useful, they're less accurate than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors.
+This limitation is due to the current implementation of the importance matrix, but a pull request ([use GGUF to store importance matrices](https://github.com/ggml-org/llama.cpp/pull/9400)) aims to address this.
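+
+To make the cosine-similarity caveat concrete, here is a toy illustration with made-up numbers: two opposite raw-activation vectors have a raw cosine similarity of -1, yet their squared versions are identical, so a similarity computed from squares reports 1:
+
+```cpp
+#include <cmath>
+#include <cstdio>
+
+int main() {
+    const float a[3] = {  0.5f, -1.0f,  2.0f }; // raw activations
+    const float b[3] = { -0.5f,  1.0f, -2.0f }; // exactly opposite: raw cosine = -1
+
+    // element-wise squares are {0.25, 1.0, 4.0} for both vectors,
+    // so the cosine similarity over squared activations is exactly 1
+    float dp = 0.0f, ma = 0.0f, mb = 0.0f;
+    for (int i = 0; i < 3; i++) {
+        const float sa = a[i] * a[i];
+        const float sb = b[i] * b[i];
+        dp += sa * sb;
+        ma += sa * sa;
+        mb += sb * sb;
+    }
+    printf("cossim over squares = %.1f\n", dp / (std::sqrt(ma) * std::sqrt(mb)));
+    return 0;
+}
+```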
\\\n" - " [--parse-special]\n" , argv[0]); + LOG("\n %s -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output]\n" + " [--chunk 123] [--output-frequency 10] [--save-frequency 0] [--show-statistics]\n" + " [--no-ppl] [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" + " [--parse-special] [...]\n" , argv[0]); LOG("\n"); } @@ -35,6 +37,21 @@ struct Stats { int ncall = 0; }; +struct tensor_statistics { + std::string tensor; + Stats stats; + float total_sqract = 0.0f; + float mean_sqract = 0.0f; + float max_sqract = 0.0f; + float min_sqract = 0.0f; + int elements = 0; + float stddev = 0.0f; + float active = 0.0f; + float entropy = 0.0f; + float zd = 0.0f; + float cossim = 0.0f; +}; + class IMatrixCollector { public: IMatrixCollector() = default; @@ -42,6 +59,7 @@ class IMatrixCollector { bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; bool load_imatrix(const char * fname); + const std::unordered_map & get_mstats() const { return m_stats; } private: std::unordered_map m_stats; common_params m_params; @@ -70,6 +88,121 @@ static std::string filter_tensor_name(const char * name) { return wname; } +static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) { + std::vector name; + std::istringstream stream(input); + std::string item; + + while (std::getline(stream, item, '.')) { + name.push_back(item); + } + for (size_t i = 0; i < name.size(); ++i) { + if (name[i] == "blk" && i + 1 < name.size()) { + layer = name[i + 1]; + break; + } + } + for (size_t i = 0; i < name.size(); ++i) { + if (name[i] == "weight" && i > 0) { + tensor = name[i - 1]; + break; + } + } + + if (tensor.empty()) { + tensor = input; + } + if (layer.empty()) { + layer = "-"; + } +} + +static void compute_statistics(std::vector & tstats, const std::string & name, const Stats & e) { + if (e.values.size() != e.counts.size()) { + LOG_ERR("%s: activation size mismatch for tensor %s (%zu vs %zu)\n", __func__, name.c_str(), e.counts.size(), e.values.size()); + return; + } + if (e.counts.empty()) { + LOG_ERR("%s: there are no activations for tensor %s. 
+static void compute_cossim(std::vector<tensor_statistics> & tstats) {
+    static const std::regex pattern(R"(blk\.(\d+)\.)");
+    for (auto & ts : tstats) {
+        if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) {
+            const int blk = std::stoi(match[1]);
+            std::string tname(ts.tensor);
+            tname.replace(match.position(1), match.length(1), std::to_string(blk - 1));
+            auto prev = std::find_if(tstats.begin(), tstats.end(),
+                                     [tname](const tensor_statistics & t) { return t.tensor == tname; });
+            if (prev != tstats.end()) {
+                const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
+                                                    prev->stats.values.begin(), 0.0f);
+                const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
+                                                                    ts.stats.values.begin(), 0.0f));
+                const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(),
+                                                                    prev->stats.values.begin(), 0.0f));
+                const float cs = dp / (curr_mag * prev_mag);
+                ts.cossim = cs;
+            }
+        } else {
+            ts.cossim = 0;
+        }
+    }
+}
+
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);

@@ -304,6 +437,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
         LOG_ERR("%s: no data in file %s\n", __func__, fname);
         return false;
     }
+
     for (int i = 0; i < n_entries; ++i) {
         int len;
         in.read((char *)&len, sizeof(len));
         std::vector<char> name_as_vec(len+1);
@@ -338,14 +472,14 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
             return false;
         }

-        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
+        // Recreate the state as expected by save_imatrix(), and correct for weighted sum.
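+        // each entry adds its stored sums into values[] and its call count into counts[],
+        // so values[i] / counts[i] remains the per-call average used by the statistics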
         for (int i = 0; i < nval; i++) {
             e.values[i] += tmp[i];
             e.counts[i] += ncall;
         }
         e.ncall += ncall;
-
     }
+
     return true;
 }

@@ -355,7 +489,6 @@ static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     return g_collector.collect_imatrix(t, ask, user_data);
 }

-
 struct results_log_softmax {
     double log_softmax;
     float  logit;
@@ -580,11 +713,116 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {

     return true;
 }

+static bool show_statistics(const common_params & params) {
+    std::vector<tensor_statistics> ts;
+    if (params.in_files.empty() || params.in_files.size() > 1) {
+        LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n");
+        return false;
+    }
+    if (g_collector.load_imatrix(params.in_files[0].c_str())) {
+        for (const auto & [name, stats] : g_collector.get_mstats()) {
+            compute_statistics(ts, name, stats);
+        }
+    } else {
+        LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str());
+        return false;
+    }
+    if (!ts.empty()) {
+        compute_cossim(ts);
+    } else {
+        LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str());
+        return false;
+    }
+
+    struct tensor_comparer {
+        bool operator()(const tensor_statistics & a, const tensor_statistics & b) const {
+            std::string layer, name_a, name_b;
+            process_tensor_name(a.tensor, layer, name_a);
+            process_tensor_name(b.tensor, layer, name_b);
+            return name_a < name_b || (name_a == name_b && a.total_sqract > b.total_sqract);
+        }
+    };
+    std::sort(ts.begin(), ts.end(), tensor_comparer());
+
+    struct weighted_stats {
+        float weighted_bias   = 0.0f;
+        float weighted_zd     = 0.0f;
+        float weighted_cossim = 0.0f;
+        int   total_elements  = 0;
+    };
+    std::map<int, weighted_stats> ws;
+
+    LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast<int>(ts.size()));
+    LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", " Tensor", " Σ(Act²)",
+            " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD",
+            " CosSim");
+    LOG_INF(
+        "=============================================================================================================="
+        "===========================================================\n");
+    for (const auto & tstat : ts) {
+        std::string layer, name;
+        process_tensor_name(tstat.tensor, layer, name);
+
+        int blk;
+        try {
+            blk = std::stoi(layer);
+        } catch (const std::exception & e) {
+            blk = -1; // not a block layer
+        }
+
+        LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n",
+                layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract,
+                tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy,
+                100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim);
+
+        const float weighted_bias   = tstat.elements * tstat.total_sqract;
+        const float weighted_zd     = tstat.elements * tstat.zd;
+        const float weighted_cossim = tstat.elements * tstat.cossim;
+
+        if (ws.find(blk) != ws.end()) {
+            ws[blk].weighted_bias += weighted_bias;
+            ws[blk].weighted_zd += weighted_zd;
+            ws[blk].weighted_cossim += weighted_cossim;
+            ws[blk].total_elements += tstat.elements;
+        } else {
+            weighted_stats temp_ws;
+            temp_ws.weighted_bias   = weighted_bias;
+            temp_ws.weighted_zd     = weighted_zd;
+            temp_ws.weighted_cossim = weighted_cossim;
+            temp_ws.total_elements  = tstat.elements;
+            ws[blk] = temp_ws;
+        }
+    }
+
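+    // per-layer averages weight each tensor by its element count:
+    //   μstat = Σ(elements_i * stat_i) / Σ(elements_i) over the tensors of the layer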
+    const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; });
+    LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers);
+    LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Act²)", " μZD", "μCosSim");
+    LOG_INF("================================================\n");
+    for (const auto & [first, second] : ws) {
+        const auto & layer = first;
+        const auto & stats = second;
+
+        if (stats.total_elements == 0) {
+            continue;
+        }
+
+        if (layer >= 0) {
+            const float bias   = stats.weighted_bias / stats.total_elements;
+            const float zd     = stats.weighted_zd / stats.total_elements;
+            const float cossim = stats.weighted_cossim / stats.total_elements;
+
+            LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim);
+        }
+    }
+    LOG_INF("\n");
+
+    return true;
+}
+
 int main(int argc, char ** argv) {
     common_params params;
-
     params.out_file = "imatrix.dat";
-
     params.n_ctx = 512;
     params.escape = false;
@@ -592,10 +830,15 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    common_init();
+    if (params.show_statistics) {
+        if (!show_statistics(params)) {
+            return 1;
+        }
+        return 0;
+    }

+    common_init();
     params.n_batch = std::min(params.n_batch, params.n_ctx);
-
     g_collector.set_params(params);

     for (const auto & in_file : params.in_files) {
@@ -655,12 +898,10 @@ int main(int argc, char ** argv) {
         }
     }

-
     g_collector.save_imatrix();

     LOG("\n");
     llama_perf_context_print(ctx);
-
     llama_backend_free();

     return 0;