
Commit 09ab5c1

restore simple.cpp for now

1 parent fea4e9d · commit 09ab5c1

File tree

1 file changed (+138, -130 lines)


examples/simple/simple.cpp

Lines changed: 138 additions & 130 deletions
@@ -1,173 +1,181 @@
-#include <stdio.h>
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
 #include <string>
 #include <vector>

-#include "llama.h"
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <signal.h>
+#endif


-void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_token>& prompt_tokens, float temperature) {
-    // print the tokens from the prompt
-    for (llama_token id : prompt_tokens) {
-        printf("%s", llama_token_to_str(ctx, id));
-    }
-    fflush(stdout);

-    // the maximum number of tokens to generate at a time
-    // TODO: not supported, remove
-    const int CUDA_MAX_TOKENS = 1;
-    llama_token tokens_out[CUDA_MAX_TOKENS];
+int main(int argc, char ** argv)
+{
+    gpt_params params;

-    // current position in the context window
-    int n_past = 0;
+    //---------------------------------
+    // Print help :
+    //---------------------------------

-    // number of tokens to generate
-    int n_tokens_out;
+    if ( argc == 1 || argv[1][0] == '-' )
+    {
+        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+        return 1 ;
+    }

-    // list of tokens to evaluate
-    // note that at most llama_context_params::n_batch tokens can be evaluated at a time
-    std::vector<llama_token> token_list = prompt_tokens;
+    //---------------------------------
+    // Load parameters :
+    //---------------------------------

-    while (n_past < n_ctx) {
-        // evaluate the tokens
+    if ( argc >= 2 )
+    {
+        params.model = argv[1];
+    }

-        // llama_eval generates one token at a time
-        n_tokens_out = 1;
+    if ( argc >= 3 )
+    {
+        params.prompt = argv[2];
+    }

-        // number of threads to use for CPU evaluation - ignored if compiled with CUDA support
-        const int n_threads = 4;
-        // note: llama_eval is not compatible with GPU sampling
-        if (llama_eval(ctx, token_list.data(), token_list.size(), n_past, n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__ );
-            exit(1);
-        }
+    if ( params.prompt.empty() )
+    {
+        params.prompt = "Hello my name is";
+    }

-        // perform sampling on the CPU
-        float * logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
+    //---------------------------------
+    // Init LLM :
+    //---------------------------------

-        // initialize candidate array from logits
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for(llama_token token_id = 0 ; token_id < n_vocab ; token_id++) {
-            candidates.push_back(llama_token_data{ token_id, logits[token_id], 0.0f});
-        }
+    llama_backend_init(params.numa);

-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-        // sample token
-        llama_sample_temperature(ctx, &candidates_p, temperature);
-        tokens_out[0] = llama_sample_token(ctx, &candidates_p);
+    llama_model * model;
+    llama_context * ctx;

-        // increment the position in the context window
-        n_past += token_list.size() + n_tokens_out - 1;
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );

-        token_list.clear();
+    if ( model == NULL )
+    {
+        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+        return 1;
+    }

-        // print the new tokens
-        for (int i = 0; i < n_tokens_out; i++) {
-            llama_token new_token_id = tokens_out[i];
+    //---------------------------------
+    // Tokenize the prompt :
+    //---------------------------------

-            // is it an end of stream ?
-            if (new_token_id == llama_token_eos()) {
-                fprintf(stderr, " [end of text]\n");
-                //return;
-            }
+    std::vector<llama_token> tokens_list;
+    tokens_list = ::llama_tokenize( ctx , params.prompt , true );

-            // print the new token :
-            printf("%s", llama_token_to_str(ctx, new_token_id));
-        }
-        fflush(stdout);
+    const int max_context_size = llama_n_ctx( ctx );
+    const int max_tokens_list_size = max_context_size - 4 ;

-        // push the last new token for the next evaluation
-        token_list.push_back(tokens_out[n_tokens_out - 1]);
+    if ( (int)tokens_list.size() > max_tokens_list_size )
+    {
+        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
+             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+        return 1;
     }
-}

-int main(int argc, char ** argv) {
-    if (argc < 2 || argv[1][0] == '-') {
-        printf("usage: %s <model> <n_ctx> <n_gens> <temp> [prompt]\n", argv[0]);
-        printf(" note: passing a temp parameter will enable GPU sampling\n");
-        return 1 ;
-    }
+    fprintf( stderr, "\n\n" );

-    std::string model = argv[1];
-    struct llama_context_params lparams = llama_context_default_params();
+    // Print the tokens from the prompt :

-    if (argc >= 3) {
-        lparams.n_ctx = std::stoi(argv[2]);
-    } else {
-        lparams.n_ctx = 512;
+    for( auto id : tokens_list )
+    {
+        printf( "%s" , llama_token_to_str( ctx , id ) );
     }

-    int n_gens;
-    if (argc >= 4) {
-        n_gens = std::stoi(argv[3]);
-    } else {
-        n_gens = 1;
-    }
+    fflush(stdout);

-    float temperature;

-    if (argc >= 5) {
-        temperature = std::stof(argv[4]);
-    } else {
-        temperature = 0.8f;
-    }
+    //---------------------------------
+    // Main prediction loop :
+    //---------------------------------

-    std::string prompt;
-    if (argc >= 6) {
-        prompt = argv[5];
-    } else {
-        prompt = "Hello my name is";
-    }
+    // The LLM keeps a contextual cache memory of previous token evaluation.
+    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
+    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
+    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    // initialize llama.cpp
-    bool numa = false;
-    llama_init_backend(numa);
+    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
+    {
+        //---------------------------------
+        // Evaluate the tokens :
+        //---------------------------------

-    llama_model * lmodel = llama_load_model_from_file(model.c_str(), lparams);
-    if (lmodel == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, model.c_str());
-        return 1;
-    }
+        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
+        {
+            fprintf( stderr, "%s : failed to eval\n" , __func__ );
+            return 1;
+        }

-    llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, model.c_str());
-        llama_free_model(lmodel);
-        return 1;
-    }
+        tokens_list.clear();

-    // tokenize the prompt
-    std::vector<llama_token> token_list(lparams.n_ctx);
-    int prompt_tokens = llama_tokenize(ctx, prompt.c_str(), token_list.data(), token_list.size(), true);
-    if (prompt_tokens <= 0) {
-        fprintf(stderr, "%s: error: unable to tokenize prompt\n", __func__);
-        return 1;
-    }
+        //---------------------------------
+        // Select the best prediction :
+        //---------------------------------

-    token_list.resize(prompt_tokens);
+        llama_token new_token_id = 0;

-    const int max_context_size = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4 ;
+        auto logits = llama_get_logits( ctx );
+        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)

-    if ((int)token_list.size() > max_tokens_list_size) {
-        fprintf( stderr, "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__, (int)token_list.size(), max_tokens_list_size );
-        return 1;
-    }
+        std::vector<llama_token_data> candidates;
+        candidates.reserve( n_vocab );

-    fprintf(stderr, "\n\n");
+        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
+        {
+            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        }

-    // generate the sequences
-    for (int i = 0; i < n_gens; i++) {
-        printf("==== GENERATION %d ====\n", i + 1);
-        generate_sequence(ctx, max_context_size, token_list, temperature);
-        printf("\n\n");
-    }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+        // Select it using the "Greedy sampling" method :
+        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
+
+
+        // is it an end of stream ?
+        if ( new_token_id == llama_token_eos() )
+        {
+            fprintf(stderr, " [end of text]\n");
+            break;
+        }
+
+        // Print the new token :
+        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
+        fflush( stdout );
+
+        // Push this new token for next evaluation :
+        tokens_list.push_back( new_token_id );

-    llama_print_timings(ctx);
-    llama_free(ctx);
+    } // wend of main loop
+
+    llama_free( ctx );
+    llama_free_model( model );
+
+    llama_backend_free();

     return 0;
 }
+
+// EOF

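The restored example drops the temperature-based sampling of the previous version: on each iteration it builds a llama_token_data candidate array from llama_get_logits() and lets llama_sample_token_greedy() pick the single highest-scoring token, so generation is deterministic for a given prompt. As a minimal standalone sketch of that selection step (the 5-entry toy vocabulary and its logit values below are invented for illustration and are not part of the commit), greedy sampling reduces to an argmax over the logits:

// greedy_pick.cpp - standalone sketch of the greedy selection step (illustrative only)
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // made-up logits, one score per entry of a toy 5-token vocabulary;
    // in simple.cpp these come from llama_get_logits(ctx)
    std::vector<float> logits = { -1.2f, 0.3f, 2.7f, 0.3f, -0.5f };

    // greedy sampling = pick the token id with the largest logit, which is the
    // choice llama_sample_token_greedy makes over the candidates array
    const int new_token_id = (int)(std::max_element(logits.begin(), logits.end()) - logits.begin());

    printf("greedy pick: token id %d (logit %.2f)\n", new_token_id, logits[new_token_id]);
    return 0;
}

Per the usage string in the diff, the rebuilt example is invoked as "simple MODEL_PATH [PROMPT]"; the binary name and model path depend on how the project is built, so an invocation such as ./simple ./models/7B/ggml-model.bin "Hello my name is" is only an assumed example.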