From 021e6d9944f4e0b0da36de9566e5ad79a6b3ab27 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Tue, 16 May 2023 03:09:34 +0300
Subject: [PATCH 1/8] Steering

---
 examples/common.cpp    | 24 ++++++++++++++++++++++
 examples/common.h      |  5 +++++
 examples/main/main.cpp | 22 ++++++++++++++++++++
 llama.cpp              | 46 ++++++++++++++++++++++++++++++++++++++++++
 llama.h                |  3 +++
 5 files changed, 100 insertions(+)

diff --git a/examples/common.cpp b/examples/common.cpp
index 259880a7cc64f..eea8500c434b6 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -344,6 +344,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.input_suffix = argv[i];
+        } else if (arg == "--steering-add") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.steering_add = argv[i];
+        } else if (arg == "--steering-sub") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.steering_sub = argv[i];
+        } else if (arg == "--steering-mul") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.steering_mul = std::stof(argv[i]);
+        } else if (arg == "--steering-lyr") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.steering_lyr = std::stoi(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);
diff --git a/examples/common.h b/examples/common.h
index 717838f06e064..f2c836ae37a61 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -72,6 +72,11 @@ struct gpt_params {
     bool use_mlock         = false; // use mlock to keep model in memory
     bool mem_test          = false; // compute maximum memory usage
     bool verbose_prompt    = false; // print prompt tokens before generation
+
+    std::string steering_add = "";
+    std::string steering_sub = "";
+    float steering_mul = 1.0f;
+    int steering_lyr = 20;
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 8543414dd0fbb..8ae64b93ceb5f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -136,6 +136,28 @@ int main(int argc, char ** argv) {
         return 0;
     }

+    if (params.steering_add.size() || params.steering_sub.size())
+    {
+        auto steering_add_tokens = ::llama_tokenize(ctx, params.steering_add, true);
+        auto steering_sub_tokens = ::llama_tokenize(ctx, params.steering_sub, true);
+
+        if (steering_add_tokens.size() != steering_sub_tokens.size()) {
+            llama_token space;
+            llama_tokenize(ctx, " ", &space, 1, 0);
+
+            while (steering_add_tokens.size() < steering_sub_tokens.size()) steering_add_tokens.push_back(space);
+            while (steering_sub_tokens.size() < steering_add_tokens.size()) steering_sub_tokens.push_back(space);
+        }
+
+        llama_set_steering_write(ctx, params.steering_lyr, params.steering_mul/2);
+        llama_eval(ctx, steering_add_tokens.data(), std::min((int)steering_add_tokens.size(), params.n_ctx), 0, params.n_threads);
+
+        llama_set_steering_write(ctx, params.steering_lyr, -params.steering_mul/2);
+        llama_eval(ctx, steering_sub_tokens.data(), std::min((int)steering_sub_tokens.size(), params.n_ctx), 0, params.n_threads);
+
+        llama_set_steering_read(ctx, params.steering_lyr, 1);
+    }
+
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');

diff --git a/llama.cpp b/llama.cpp
index 98f49abd7cf48..61afe7d623b43 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -229,6 +229,15 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;

+    std::vector<float> steering_vector; // [n_ctx, n_embd]
+    int steering_layer = 0;
+    int steering_mode = 0;
+    float steering_mul = 0.0f;
+
+    #define STEERING_OFF 0
+    #define STEERING_WRITE 2
+    #define STEERING_READ 3
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -269,6 +278,17 @@ struct llama_context {
     }
 };

+void llama_set_steering_write(struct llama_context * ctx, int layer, float mul) {
+    ctx->steering_mode = STEERING_WRITE;
+    ctx->steering_mul = mul;
+    ctx->steering_layer = layer;
+}
+void llama_set_steering_read(struct llama_context * ctx, int layer, float mul) {
+    ctx->steering_mode = STEERING_READ;
+    ctx->steering_mul = mul;
+    ctx->steering_layer = layer;
+}
+
 template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
@@ -1141,6 +1161,12 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));

+    struct ggml_tensor * steer;
+    if (lctx.steering_mode != STEERING_OFF) {
+        steer = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_embd);
+        memcpy(steer->data, lctx.steering_vector.data(), ggml_nbytes(steer));
+    }
+
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

     for (int il = 0; il < n_layer; ++il) {
@@ -1150,6 +1176,18 @@ static bool llama_eval_internal(

         lctx.use_buf(ctx0, 0);

+        if (lctx.steering_mode != STEERING_OFF && il == lctx.steering_layer) {
+            steer->data = lctx.steering_vector.data();
+
+            struct ggml_tensor * src = ggml_scale(ctx0, inpL, ggml_new_f32(ctx0, lctx.steering_mul));
+            struct ggml_tensor * dst = ggml_view_2d(ctx0, steer, n_embd, N, n_embd * sizeof(float), n_past * n_embd * sizeof(float));
+            if (lctx.steering_mode == STEERING_WRITE) {
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, ggml_add(ctx0, src, dst), dst));
+            } else {
+                inpL = src;
+            }
+        }
+
         // norm
         {
             cur = ggml_rms_norm(ctx0, inpL);
@@ -1363,6 +1401,12 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }

+
+    if (lctx.steering_mode == STEERING_WRITE) {
+        memcpy(lctx.steering_vector.data(), steer->data, ggml_nbytes(steer));
+    }
+
+
     if (mem_per_token == 0) {
         mem_per_token = ggml_used_mem(ctx0)/N;
     }
@@ -2184,6 +2228,8 @@ struct llama_context * llama_init_from_file(

         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+
+        ctx->steering_vector.resize(hparams.n_ctx * hparams.n_embd);
     }

     return ctx;
diff --git a/llama.h b/llama.h
index 21cba8cf61061..99882ec396b43 100644
--- a/llama.h
+++ b/llama.h
@@ -191,6 +191,9 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos();
     LLAMA_API llama_token llama_token_nl();

+    LLAMA_API void llama_set_steering_write(struct llama_context * ctx, int layer, float mul);
+    LLAMA_API void llama_set_steering_read(struct llama_context * ctx, int layer, float mul);
+
     // Sampling functions

     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
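
Note on what this first patch implements: it is activation addition ("steering vectors"). Evaluating the --steering-add and --steering-sub prompts in WRITE mode accumulates the hidden states at layer steering_lyr, scaled by +steering_mul/2 and -steering_mul/2, into the per-position steering_vector buffer, so the buffer ends up holding half the difference between the two prompts' activations. READ mode is then supposed to mix that difference back into the residual stream during normal evals. A toy restatement of the buffer arithmetic in plain C++ (names invented here; the read rule is shown in the additive form that patch 2 below adopts — as committed in this patch, the read branch only rescales inpL by steering_mul and never adds the buffer):

    // Toy sketch, NOT llama.cpp code: the buffer has the same layout as
    // steering_vector above, [n_ctx][n_embd], flattened row-major.
    #include <vector>

    // WRITE: accumulate mul * hidden state h of the N tokens just evaluated
    // into the buffer rows for positions n_past .. n_past + N - 1.
    void steering_write(std::vector<float> & buf, const std::vector<float> & h,
                        int n_past, int N, int n_embd, float mul) {
        for (int t = 0; t < N; ++t)
            for (int e = 0; e < n_embd; ++e)
                buf[(n_past + t) * n_embd + e] += mul * h[t * n_embd + e];
    }

    // READ: mix mul * buffer row back into the hidden state at the target layer.
    void steering_read(std::vector<float> & h, const std::vector<float> & buf,
                       int n_past, int N, int n_embd, float mul) {
        for (int t = 0; t < N; ++t)
            for (int e = 0; e < n_embd; ++e)
                h[t * n_embd + e] += mul * buf[(n_past + t) * n_embd + e];
    }

Keeping the buffer sized [n_ctx, n_embd] rather than a single averaged vector is what lets WRITE and READ address rows by absolute token position.
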
From 8388aaa604ce25d7b036b475d01825e1977187fb Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Tue, 16 May 2023 15:16:00 +0300
Subject: [PATCH 2/8] cleanup and stuff

---
 examples/common.cpp    |  8 +++++--
 examples/common.h      |  6 +++---
 examples/main/main.cpp | 48 +++++++++++++++++++++++-------------------
 llama.cpp              | 23 +++++++++++---------
 4 files changed, 48 insertions(+), 37 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index eea8500c434b6..aaf6e27a9886d 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -362,12 +362,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.steering_mul = std::stof(argv[i]);
-        } else if (arg == "--steering-lyr") {
+        } else if (arg == "--steering-layer") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.steering_lyr = std::stoi(argv[i]);
+            params.steering_layer = std::stoi(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);
@@ -454,6 +454,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     }
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  --steering-add        add positive steering prompt\n");
+    fprintf(stderr, "  --steering-sub        add negativ steering prompt\n");
+    fprintf(stderr, "  --steering-mul        set steering strength (negative is reverse, default %.1f)\n", params.steering_mul);
+    fprintf(stderr, "  --steering-layer      set layer for steering (default %d)\n", params.steering_layer);
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
diff --git a/examples/common.h b/examples/common.h
index f2c836ae37a61..e56ad648ea1be 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -73,10 +73,10 @@ struct gpt_params {
     bool mem_test          = false; // compute maximum memory usage
     bool verbose_prompt    = false; // print prompt tokens before generation

-    std::string steering_add = "";
-    std::string steering_sub = "";
+    std::string steering_add;
+    std::string steering_sub;
     float steering_mul = 1.0f;
-    int steering_lyr = 20;
+    int steering_layer = 15;
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 8ae64b93ceb5f..ffa779e050e6b 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -136,28 +136,6 @@ int main(int argc, char ** argv) {
         return 0;
     }

-    if (params.steering_add.size() || params.steering_sub.size())
-    {
-        auto steering_add_tokens = ::llama_tokenize(ctx, params.steering_add, true);
-        auto steering_sub_tokens = ::llama_tokenize(ctx, params.steering_sub, true);
-
-        if (steering_add_tokens.size() != steering_sub_tokens.size()) {
-            llama_token space;
-            llama_tokenize(ctx, " ", &space, 1, 0);
-
-            while (steering_add_tokens.size() < steering_sub_tokens.size()) steering_add_tokens.push_back(space);
-            while (steering_sub_tokens.size() < steering_add_tokens.size()) steering_sub_tokens.push_back(space);
-        }
-
-        llama_set_steering_write(ctx, params.steering_lyr, params.steering_mul/2);
-        llama_eval(ctx, steering_add_tokens.data(), std::min((int)steering_add_tokens.size(), params.n_ctx), 0, params.n_threads);
-
-        llama_set_steering_write(ctx, params.steering_lyr, -params.steering_mul/2);
-        llama_eval(ctx, steering_sub_tokens.data(), std::min((int)steering_sub_tokens.size(), params.n_ctx), 0, params.n_threads);
-
-        llama_set_steering_read(ctx, params.steering_lyr, 1);
-    }
-
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');

@@ -196,6 +174,32 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    if (!params.steering_add.empty() || !params.steering_sub.empty())
+    {
+        params.steering_add.insert(0, 1, ' ');
+        params.steering_sub.insert(0, 1, ' ');
+
+        auto add_tokens = ::llama_tokenize(ctx, params.steering_add, true);
+        auto sub_tokens = ::llama_tokenize(ctx, params.steering_sub, true);
+
+        //if (add_tokens.size() != sub_tokens.size()) {
+        //    while (add_tokens.size() < sub_tokens.size()) {
+        //        add_tokens.push_back(llama_token_nl());
+        //    }
+        //    while (sub_tokens.size() < add_tokens.size()) {
+        //        sub_tokens.push_back(llama_token_nl());
+        //    }
+        //}
+        //const int N = embd_inp.size();
+        llama_set_steering_write(ctx, params.steering_layer, +1.0f);
+        llama_eval(ctx, add_tokens.data(), std::min((int)add_tokens.size(), n_ctx), 0, params.n_threads);
+
+        llama_set_steering_write(ctx, params.steering_layer, -1.0f);
+        llama_eval(ctx, sub_tokens.data(), std::min((int)sub_tokens.size(), n_ctx), 0, params.n_threads);
+
+        llama_set_steering_read(ctx, params.steering_layer, params.steering_mul);
+    }
+
     // debug message about similarity of saved session, if applicable
     size_t n_matching_session_tokens = 0;
     if (session_tokens.size()) {
diff --git a/llama.cpp b/llama.cpp
index 61afe7d623b43..5e85e55d5f165 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -287,6 +287,9 @@ void llama_set_steering_read(struct llama_context * ctx, int layer, float mul) {
     ctx->steering_mode = STEERING_READ;
     ctx->steering_mul = mul;
     ctx->steering_layer = layer;
+    //FILE* steeringbin = fopen("steering.bin", "wb");
+    //fwrite(ctx->steering_vector.data(), sizeof(float), ctx->steering_vector.size(), steeringbin);
+    //fclose(steeringbin);
 }

 template <typename T>
@@ -1163,8 +1166,9 @@ static bool llama_eval_internal(

     struct ggml_tensor * steer;
     if (lctx.steering_mode != STEERING_OFF) {
-        steer = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_embd);
-        memcpy(steer->data, lctx.steering_vector.data(), ggml_nbytes(steer));
+        steer = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        //steer->data = lctx.steering_vector.data() + n_past * n_embd * sizeof(float);
+        memcpy(steer->data, lctx.steering_vector.data() + n_past * n_embd * sizeof(float), ggml_nbytes(steer));
     }

     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1177,15 +1181,14 @@ static bool llama_eval_internal(
         lctx.use_buf(ctx0, 0);

         if (lctx.steering_mode != STEERING_OFF && il == lctx.steering_layer) {
-            steer->data = lctx.steering_vector.data();
-
-            struct ggml_tensor * src = ggml_scale(ctx0, inpL, ggml_new_f32(ctx0, lctx.steering_mul));
-            struct ggml_tensor * dst = ggml_view_2d(ctx0, steer, n_embd, N, n_embd * sizeof(float), n_past * n_embd * sizeof(float));
+            struct ggml_tensor * scal = ggml_new_f32(ctx0, lctx.steering_mul);
             if (lctx.steering_mode == STEERING_WRITE) {
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, ggml_add(ctx0, src, dst), dst));
-            } else {
-                inpL = src;
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0,
+                    ggml_add(ctx0, ggml_scale(ctx0, inpL, scal), steer), steer));
+                break;
             }
+
+            inpL = ggml_add(ctx0, ggml_scale(ctx0, steer, scal), inpL);
         }

         // norm
@@ -1403,7 +1406,7 @@ static bool llama_eval_internal(


     if (lctx.steering_mode == STEERING_WRITE) {
-        memcpy(lctx.steering_vector.data(), steer->data, ggml_nbytes(steer));
+        memcpy(lctx.steering_vector.data() + n_past * n_embd * sizeof(float), steer->data, ggml_nbytes(steer));
     }


From c90059fba6f2fa8374fdf5488126bbf5a998cf7a Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Tue, 16 May 2023 18:43:40 +0300
Subject: [PATCH 3/8] separate source layer for steering vector.

---
 examples/common.cpp    | 11 +++++++++--
 examples/common.h      |  1 +
 examples/main/main.cpp |  2 +-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index aaf6e27a9886d..da09853bda0eb 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -362,6 +362,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.steering_mul = std::stof(argv[i]);
+        } else if (arg == "--steering-source") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.steering_source = std::stoi(argv[i]);
         } else if (arg == "--steering-layer") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -456,8 +462,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "                        number of layers to store in VRAM\n");
     fprintf(stderr, "  --steering-add        add positive steering prompt\n");
     fprintf(stderr, "  --steering-sub        add negativ steering prompt\n");
-    fprintf(stderr, "  --steering-mul        set steering strength (negative is reverse, default %.1f)\n", params.steering_mul);
-    fprintf(stderr, "  --steering-layer      set layer for steering (default %d)\n", params.steering_layer);
+    fprintf(stderr, "  --steering-mul        steering strength (negative is reverse, default %.1f)\n", params.steering_mul);
+    fprintf(stderr, "  --steering-source     layer for steering source (default %d)\n", params.steering_source);
+    fprintf(stderr, "  --steering-layer      layer for steering insertion (default %d)\n", params.steering_layer);
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
diff --git a/examples/common.h b/examples/common.h
index e56ad648ea1be..04883dcf3008f 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -77,6 +77,7 @@ struct gpt_params {
     std::string steering_sub;
     float steering_mul = 1.0f;
     int steering_layer = 15;
+    int steering_source = 2;
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index ffa779e050e6b..18280bde10e38 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -191,7 +191,7 @@ int main(int argc, char ** argv) {
         //    }
         //}
         //const int N = embd_inp.size();
-        llama_set_steering_write(ctx, params.steering_layer, +1.0f);
+        llama_set_steering_write(ctx, params.steering_source, +1.0f);
         llama_eval(ctx, add_tokens.data(), std::min((int)add_tokens.size(), n_ctx), 0, params.n_threads);

         llama_set_steering_write(ctx, params.steering_layer, -1.0f);

From 1b0ff2cf6a808262a2617bd2830841884caee8fb Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Wed, 17 May 2023 10:39:18 +0300
Subject: [PATCH 4/8] Update examples/common.cpp

Fix typo

Co-authored-by: Extra Dosages
---
 examples/common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index da09853bda0eb..bf72da3cc3998 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -461,7 +461,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
     fprintf(stderr, "  --steering-add        add positive steering prompt\n");
-    fprintf(stderr, "  --steering-sub        add negativ steering prompt\n");
+    fprintf(stderr, "  --steering-sub        add negative steering prompt\n");
     fprintf(stderr, "  --steering-mul        steering strength (negative is reverse, default %.1f)\n", params.steering_mul);
     fprintf(stderr, "  --steering-source     layer for steering source (default %d)\n", params.steering_source);
     fprintf(stderr, "  --steering-layer      layer for steering insertion (default %d)\n", params.steering_layer);
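
Note on the prompt padding that the next patch re-enables (commented out since patch 2): the steering buffer is indexed by absolute token position, so the add and sub passes only cancel at positions where both prompts contribute. If one prompt tokenizes to fewer tokens, its missing tail positions would leave an unpaired positive or negative contribution in the buffer; padding the shorter token list (with llama_token_nl()) keeps the two passes aligned. For example, if the add prompt yields 2 tokens and the sub prompt 3, the add pass is padded to 3 tokens so that position 2 receives both a +1 and a -1 write rather than only the -1.
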
From 7f59af52a90b8011005a8a4aefa109f612cb490d Mon Sep 17 00:00:00 2001
From: Laura
Date: Thu, 18 May 2023 23:47:10 +0200
Subject: [PATCH 5/8] Steer with inpSA instead of with inpL

Signed-off-by: Henri Vasserman
---
 examples/main/main.cpp | 25 ++++++++++++-------------
 llama.cpp              |  5 +++--
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 18280bde10e38..974e1277bdcbd 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -176,28 +176,27 @@ int main(int argc, char ** argv) {

     if (!params.steering_add.empty() || !params.steering_sub.empty())
     {
-        params.steering_add.insert(0, 1, ' ');
-        params.steering_sub.insert(0, 1, ' ');
-
         auto add_tokens = ::llama_tokenize(ctx, params.steering_add, true);
         auto sub_tokens = ::llama_tokenize(ctx, params.steering_sub, true);

-        //if (add_tokens.size() != sub_tokens.size()) {
-        //    while (add_tokens.size() < sub_tokens.size()) {
-        //        add_tokens.push_back(llama_token_nl());
-        //    }
-        //    while (sub_tokens.size() < add_tokens.size()) {
-        //        sub_tokens.push_back(llama_token_nl());
-        //    }
-        //}
-        //const int N = embd_inp.size();
+        if (add_tokens.size() != sub_tokens.size()) {
+        while (add_tokens.size() < sub_tokens.size()) {
+            add_tokens.push_back(llama_token_nl());
+        }
+        while (sub_tokens.size() < add_tokens.size()) {
+            sub_tokens.push_back(llama_token_nl());
+        }
+        }
+
         llama_set_steering_write(ctx, params.steering_source, +1.0f);
         llama_eval(ctx, add_tokens.data(), std::min((int)add_tokens.size(), n_ctx), 0, params.n_threads);

-        llama_set_steering_write(ctx, params.steering_layer, -1.0f);
+        llama_set_steering_write(ctx, params.steering_source, -1.0f);
         llama_eval(ctx, sub_tokens.data(), std::min((int)sub_tokens.size(), n_ctx), 0, params.n_threads);

         llama_set_steering_read(ctx, params.steering_layer, params.steering_mul);
+        std::cout << "Steering: `" << params.steering_add << "` - `" << params.steering_sub << "` * " << params.steering_mul << "\n";
     }

     // debug message about similarity of saved session, if applicable
diff --git a/llama.cpp b/llama.cpp
index 5e85e55d5f165..a02ef4cb8db3f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <iostream>

 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1187,8 +1188,8 @@ static bool llama_eval_internal(
                     ggml_add(ctx0, ggml_scale(ctx0, inpL, scal), steer), steer));
                 break;
             }
-
-            inpL = ggml_add(ctx0, ggml_scale(ctx0, steer, scal), inpL);
+            // std::cout << "\nAdding steering vector to inpL " << il << "\n";
+            inpSA = ggml_add(ctx0, ggml_scale(ctx0, steer, scal), inpSA);
         }

         // norm
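
The inpL-to-inpSA change moves where the steering vector enters each block: inpSA is the copy of the layer input that feeds the residual connection around the attention sublayer, so the vector is now injected into the residual stream (and reaches later layers through it) instead of replacing the steered layer's own normed input. The same hunk also fixes a leftover from patch 3 in passing: the second capture call now uses params.steering_source like the first, where it had still been passing params.steering_layer.
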
From 7df9ab96873d0b451ca22a76b4b4414e85b1072a Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Fri, 19 May 2023 01:47:26 +0300
Subject: [PATCH 6/8] clean up

---
 examples/main/main.cpp | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 974e1277bdcbd..bf940978ac6cf 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -176,17 +176,23 @@ int main(int argc, char ** argv) {

     if (!params.steering_add.empty() || !params.steering_sub.empty())
     {
+        fprintf(stderr, "%s: steering: ('%s' - '%s') * %f\n",
+            __func__, params.steering_add.c_str(), params.steering_sub.c_str(), params.steering_mul);
+
+        params.steering_add.insert(0, 1, ' ');
+        params.steering_sub.insert(0, 1, ' ');
+
         auto add_tokens = ::llama_tokenize(ctx, params.steering_add, true);
         auto sub_tokens = ::llama_tokenize(ctx, params.steering_sub, true);

         if (add_tokens.size() != sub_tokens.size()) {
-        while (add_tokens.size() < sub_tokens.size()) {
-            add_tokens.push_back(llama_token_nl());
-        }
-        while (sub_tokens.size() < add_tokens.size()) {
-            sub_tokens.push_back(llama_token_nl());
-        }
+            while (add_tokens.size() < sub_tokens.size()) {
+                add_tokens.push_back(llama_token_nl());
+            }
+            while (sub_tokens.size() < add_tokens.size()) {
+                sub_tokens.push_back(llama_token_nl());
+            }
         }

         llama_set_steering_write(ctx, params.steering_source, +1.0f);
@@ -196,7 +202,6 @@ int main(int argc, char ** argv) {
         llama_eval(ctx, sub_tokens.data(), std::min((int)sub_tokens.size(), n_ctx), 0, params.n_threads);

         llama_set_steering_read(ctx, params.steering_layer, params.steering_mul);
-        std::cout << "Steering: `" << params.steering_add << "` - `" << params.steering_sub << "` * " << params.steering_mul << "\n";
     }

     // debug message about similarity of saved session, if applicable

From 5c9b45c204cd9624a5ed656ad1ec558e7cb0da56 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Fri, 19 May 2023 16:44:32 +0300
Subject: [PATCH 7/8] Fix a very noobish C mistake

Oops!
---
 llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index a02ef4cb8db3f..4b82c9ef85f82 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1169,7 +1169,7 @@ static bool llama_eval_internal(
     if (lctx.steering_mode != STEERING_OFF) {
         steer = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
         //steer->data = lctx.steering_vector.data() + n_past * n_embd * sizeof(float);
-        memcpy(steer->data, lctx.steering_vector.data() + n_past * n_embd * sizeof(float), ggml_nbytes(steer));
+        memcpy(steer->data, lctx.steering_vector.data() + n_past * n_embd, ggml_nbytes(steer));
     }

     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1407,7 +1407,7 @@ static bool llama_eval_internal(


     if (lctx.steering_mode == STEERING_WRITE) {
-        memcpy(lctx.steering_vector.data() + n_past * n_embd * sizeof(float), steer->data, ggml_nbytes(steer));
+        memcpy(lctx.steering_vector.data() + n_past * n_embd, steer->data, ggml_nbytes(steer));
     }

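
The "noobish C mistake" is worth spelling out: steering_vector.data() is a float *, and arithmetic on a typed pointer already advances in elements, not bytes. The extra * sizeof(float) therefore stepped four times too far into the buffer, reading and writing out of bounds whenever n_past > 0. A minimal illustration:

    float * base = lctx.steering_vector.data();
    float * ok   = base + n_past * n_embd;                 // advances n_past * n_embd floats
    float * bad  = base + n_past * n_embd * sizeof(float); // 4x the offset: out of bounds

The byte-based form was correct in the earlier ggml_view_2d offset, which does take bytes; carrying that habit over to raw pointer arithmetic is what went wrong.
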
From da3d60f1547e9345f13a95abef4e8566a9c991b7 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Fri, 19 May 2023 17:24:43 +0300
Subject: [PATCH 8/8] turning off

---
 examples/main/main.cpp | 2 ++
 llama.cpp              | 4 ++++
 llama.h                | 1 +
 3 files changed, 7 insertions(+)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index bf940978ac6cf..ab6d58e7d98e9 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -430,6 +430,8 @@ int main(int argc, char ** argv) {
                 llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
             }

+            //llama_set_steering_off(ctx);
+
             llama_token id = 0;

             {
diff --git a/llama.cpp b/llama.cpp
index 4b82c9ef85f82..dd46694183b44 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -279,6 +279,10 @@ struct llama_context {
     }
 };

+void llama_set_steering_off(struct llama_context * ctx) {
+    ctx->steering_mode = STEERING_OFF;
+}
+
 void llama_set_steering_write(struct llama_context * ctx, int layer, float mul) {
     ctx->steering_mode = STEERING_WRITE;
     ctx->steering_mul = mul;
diff --git a/llama.h b/llama.h
index 99882ec396b43..71036c31d308c 100644
--- a/llama.h
+++ b/llama.h
@@ -191,6 +191,7 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos();
     LLAMA_API llama_token llama_token_nl();

+    LLAMA_API void llama_set_steering_off(struct llama_context * ctx);
     LLAMA_API void llama_set_steering_write(struct llama_context * ctx, int layer, float mul);
     LLAMA_API void llama_set_steering_read(struct llama_context * ctx, int layer, float mul);
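
End to end, the series leaves the following host-side flow. This is a hypothetical sketch against the final API, not code from the patches: it assumes llama.h from this series, a context created elsewhere with llama_init_from_file(), and it uses the common.h defaults (steering_source = 2, steering_layer = 15); the prompts and thread count are placeholders.

    #include "llama.h"
    #include <vector>

    void steer_then_generate(llama_context * ctx, int n_threads) {
        std::vector<llama_token> add_toks(64), sub_toks(64);
        // add_bos = true, matching how main.cpp tokenizes the steering prompts
        const int n_add = llama_tokenize(ctx, " happy", add_toks.data(), (int) add_toks.size(), true);
        const int n_sub = llama_tokenize(ctx, " sad",   sub_toks.data(), (int) sub_toks.size(), true);
        // (main.cpp also pads the shorter list with llama_token_nl(); omitted here)

        // 1) capture: WRITE mode accumulates +/-1 times the activations at the
        //    source layer (default 2) into the context's steering vector
        llama_set_steering_write(ctx, /*layer=*/2, +1.0f);
        llama_eval(ctx, add_toks.data(), n_add, /*n_past=*/0, n_threads);
        llama_set_steering_write(ctx, /*layer=*/2, -1.0f);
        llama_eval(ctx, sub_toks.data(), n_sub, 0, n_threads);

        // 2) apply: READ mode adds steering_mul * vector to inpSA at the
        //    insertion layer (default 15) on every following eval
        llama_set_steering_read(ctx, /*layer=*/15, /*mul=*/2.0f);
        // ... normal prompt evaluation and sampling loop goes here ...

        // 3) patch 8 allows switching steering off again mid-run
        llama_set_steering_off(ctx);
    }

From the shell the equivalent is roughly ./main -m model.bin -p "I feel" --steering-add " happy" --steering-sub " sad" --steering-mul 2.0, with --steering-source and --steering-layer available to move the capture and insertion points.
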