diff --git a/.gitignore b/.gitignore index 2c67ad7f7c609..41fe1f31271d2 100644 --- a/.gitignore +++ b/.gitignore @@ -107,6 +107,7 @@ examples/server/*.gz.hpp !examples/*/*/*.kts !examples/sycl/*.bat !examples/sycl/*.sh +/*.wav # Server Web UI temporary files node_modules diff --git a/common/common.cpp b/common/common.cpp index 94f545f815c27..b5668ddfdb2c9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1565,3 +1565,31 @@ common_control_vector_data common_control_vector_load(const std::vector & data, int sample_rate) { + std::ofstream file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str()); + return false; + } + + wav_header header; + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const auto & sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); + } + + return file.good(); +} diff --git a/common/common.h b/common/common.h index e6eaa8e80cf05..9012e657fbefc 100644 --- a/common/common.h +++ b/common/common.h @@ -662,3 +662,25 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count"; const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; } + +// +// Audio utils +// + +struct wav_header { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t chunk_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + uint32_t fmt_chunk_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels = 1; // Mono + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + 
char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate); diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt index c72bd814c3b31..e66c298db461a 100644 --- a/examples/tts/CMakeLists.txt +++ b/examples/tts/CMakeLists.txt @@ -3,3 +3,20 @@ add_executable(${TARGET} tts.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +add_library(mimi-model STATIC mimi-model.h mimi-model.cpp) +target_link_libraries(mimi-model PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +# for using C++ designated initializers, TODO: can be changed back to C++17 in the future +target_compile_features(mimi-model PRIVATE cxx_std_20) + +set(TARGET llama-mimi) +add_executable(${TARGET} mimi.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +set(TARGET llama-tts-csm) +add_executable(${TARGET} tts-csm.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/tts/README-csm.md b/examples/tts/README-csm.md new file mode 100644 index 0000000000000..676b9889e157d --- /dev/null +++ b/examples/tts/README-csm.md @@ -0,0 +1,47 @@ +# Sesame CSM + +This demo shows running inference of [Sesame CSM](https://github.com/SesameAILabs/csm) using llama.cpp / GGML + +It contains 3 components (each has its own GGUF file): +1. Backbone LLM +2. Decoder LLM +3. 
Mimi decoder + +## Quick start + +By default, all GGUF files are downloaded from the [ggml-org Hugging Face account](https://huggingface.co/ggml-org/sesame-csm-1b-GGUF) + +```sh +# build (make sure to have LLAMA_CURL enabled) +cmake -B build -DLLAMA_CURL=ON +cmake --build build -j --target llama-tts-csm + +# run it +./build/bin/llama-tts-csm -p "[0]Hi, my name is Xuan Son. I am software engineer at Hugging Face." +``` + +## Convert the model yourself + +To get the GGUF: + +```sh +python examples/tts/convert_csm_to_gguf.py + +# default output files: +# sesame-csm-backbone.gguf +# sesame-csm-decoder.gguf + +# optionally, quantize it +# (lowest scheme is q8_0, it does not make sense to quantize further, quality degrades too much) +python examples/tts/convert_csm_to_gguf.py --outtype q8_0 +``` + +Run the example using local files: + +```sh +./build/bin/llama-tts-csm -m sesame-csm-backbone.gguf -mv kyutai-mimi.gguf -p "[0]Hello world." +# sesame-csm-decoder.gguf will automatically be loaded +# make sure to place these 2 GGUF files in the same directory + +# output file: output.wav +``` diff --git a/examples/tts/README-mimi.md b/examples/tts/README-mimi.md new file mode 100644 index 0000000000000..6576a118291ad --- /dev/null +++ b/examples/tts/README-mimi.md @@ -0,0 +1,50 @@ +# llama.cpp/example/mimi + +This demonstrates running [Kyutai's Mimi](https://huggingface.co/kyutai/mimi) model via GGML. 
+ +## Quickstart + +Convert model to GGUF (no need to download, the script will automatically download the `safetensors` file) + +```sh +python examples/tts/convert_mimi_to_gguf.py + +# output file: kyutai-mimi.gguf + +# optionally, use q8_0 quantization for faster speed +python examples/tts/convert_mimi_to_gguf.py --outtype q8_0 +``` + +Then compile, run it: + +```sh +cmake --build build -j --target llama-mimi + +./build/bin/llama-mimi kyutai-mimi.gguf codes.txt + +# output: output.wav + +# alternatively, use "dummy1" to get a "wah hello there" sample output file +./build/bin/llama-mimi kyutai-mimi.gguf dummy1 +``` + +Example of code file (one code per line): + +``` +1263 +1597 +1596 +1477 +1540 +1720 +1433 +118 +1066 +1968 +1096 +232 +418 +566 +1653 +2010 +``` diff --git a/examples/tts/convert_csm_to_gguf.py b/examples/tts/convert_csm_to_gguf.py new file mode 100644 index 0000000000000..53f586f19962d --- /dev/null +++ b/examples/tts/convert_csm_to_gguf.py @@ -0,0 +1,328 @@ +import os +import sys +import argparse +import logging +import torch +from safetensors.torch import load_file +from typing import Union, Any, Dict +from pathlib import Path +from torch import Tensor +from huggingface_hub import hf_hub_download + +cur_path = sys.path +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent.parent.parent / 'gguf-py')) +import gguf + +sys.path = cur_path + +logger = logging.getLogger("csm") + + +# This converts directly one safetensors file to 2 GGUFs +# It is easier to do this way, rather than convert to 2 smaller HF models and then convert to GGUF +# This is because the Sesame model does not have built-in tokenizer + +def get_field_data(reader: gguf.GGUFReader, key: str) -> Any: + field = reader.get_field(key) + return field.contents() if field else None + +# copied from https://github.com/SesameAILabs/csm/blob/main/models.py +class Llama_3_2_1B: + vocab_size=128_256 + num_layers=16 + num_heads=32 + num_kv_heads=8 + embed_dim=2048 + 
max_seq_len=2048 + intermediate_dim=8192 + attn_dropout=0.0 + norm_eps=1e-5 + rope_base=500_000 + scale_factor=32 + + def write_gguf_metadata(self, fout: gguf.GGUFWriter, fvocab: gguf.GGUFReader): + arch = get_field_data(fvocab, gguf.Keys.General.ARCHITECTURE) + assert arch == "llama" + fout.add_type("model") + fout.add_block_count(self.num_layers) + fout.add_context_length(self.max_seq_len) + fout.add_feed_forward_length(self.intermediate_dim) + fout.add_embedding_length(self.embed_dim) + # attn + fout.add_head_count(self.num_heads) + fout.add_head_count_kv(self.num_kv_heads) + fout.add_rope_freq_base(self.rope_base) + # fout.add_rope_scaling_factor(self.scale_factor) # breaks if this is added + fout.add_rope_dimension_count(self.embed_dim // self.num_heads) + fout.add_layer_norm_rms_eps(self.norm_eps) + fout.add_key_length(self.embed_dim // self.num_heads) + fout.add_value_length(self.embed_dim // self.num_heads) + # vocab + fout.add_vocab_size(self.vocab_size) + fout.add_tokenizer_model(get_field_data(fvocab, gguf.Keys.Tokenizer.MODEL)) + fout.add_tokenizer_pre(get_field_data(fvocab, gguf.Keys.Tokenizer.PRE)) + fout.add_token_list(get_field_data(fvocab, gguf.Keys.Tokenizer.LIST)[:self.vocab_size]) + fout.add_token_types(get_field_data(fvocab, gguf.Keys.Tokenizer.TOKEN_TYPE)[:self.vocab_size]) + fout.add_token_merges(get_field_data(fvocab, gguf.Keys.Tokenizer.MERGES)) + fout.add_bos_token_id(get_field_data(fvocab, gguf.Keys.Tokenizer.BOS_ID)) + fout.add_eos_token_id(get_field_data(fvocab, gguf.Keys.Tokenizer.EOS_ID)) + +class Llama_3_2_100M(Llama_3_2_1B): + vocab_size=65_632 #128_256 + num_layers=4 + num_heads=8 + num_kv_heads=2 + embed_dim=1024 + max_seq_len=2048 + intermediate_dim=8192 + attn_dropout=0.0 + norm_eps=1e-5 + rope_base=500_000 + scale_factor=32 + +class CSMModelConverter: + state_dict: Dict[str, Tensor] + gguf_writer_backbone: gguf.GGUFWriter + gguf_writer_decoder: gguf.GGUFWriter + gguf_reader_vocab: gguf.GGUFReader + fname_out: Path + ftype: 
gguf.LlamaFileType + + def __init__(self, + safetensors_path: Union[Path, str], + path_to_vocab_gguf: Path, + fname_out: Path, + ftype: gguf.LlamaFileType, + is_big_endian: bool,): + + if "" not in fname_out.name: + raise ValueError("Output file name must contain '' placeholder, for example: 'sesame-csm-.gguf'") + + self.state_dict = load_file(safetensors_path, device="cpu") + self.fname_out = fname_out + self.ftype = ftype + self.gguf_reader_vocab = gguf.GGUFReader(path_to_vocab_gguf) + endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + + # backbone + self.gguf_writer_backbone = gguf.GGUFWriter( + path=None, + arch="llama-csm", + endianess=endianess) + + # decoder + self.gguf_writer_decoder = gguf.GGUFWriter( + path=None, + arch="llama-csm", + endianess=endianess) + + Llama_3_2_1B().write_gguf_metadata(self.gguf_writer_backbone, self.gguf_reader_vocab) + Llama_3_2_100M().write_gguf_metadata(self.gguf_writer_decoder, self.gguf_reader_vocab) + + # load tensors + for component in ("backbone", "decoder"): + print() + print(f"Converting {component}...") + print() + for name, data_torch in self.state_dict.items(): + # convert any unsupported data types to float32 + old_dtype = data_torch.dtype + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + self.add_tensor(name, data_torch, old_dtype, component) + + def add_tensor(self, name: str, data_torch: Tensor, old_dtype: torch.dtype, component: str): + is_1d = len(data_torch.shape) == 1 + #is_embd = "_embeddings" in name + can_quantize = not is_1d #and not is_embd + data_qtype = gguf.GGMLQuantizationType.F32 + + is_backbone = False + is_decoder = False + + def rename_transformer(name: str) -> str: + # transformer + name = name.replace(".scale", ".weight") + name = name.replace("attn.k_proj", "attn_k") + name = name.replace("attn.q_proj", "attn_q") + name = name.replace("attn.v_proj", "attn_v") + name = name.replace("attn.output_proj", 
"attn_output") + name = name.replace("sa_norm", "attn_norm") + name = name.replace("mlp.w1", "ffn_gate") + name = name.replace("mlp.w2", "ffn_down") + name = name.replace("mlp.w3", "ffn_up") + name = name.replace("mlp_norm", "ffn_norm") + return name + + if "audio_embeddings." in name: + is_decoder = True + name = name.replace("audio_embeddings.", "audio_embd.") + + elif "text_embeddings." in name: + is_backbone = True + name = name.replace("text_embeddings.", "token_embd.") + + elif "backbone." in name or "codebook0_head." in name: + is_backbone = True + name = name.replace("backbone.layers.", "blk.") + name = name.replace("backbone.norm.scale", "output_norm.weight") + name = rename_transformer(name) + + elif "decoder." in name: + is_decoder = True + name = name.replace("decoder.layers.", "blk.") + name = name.replace("decoder.norm.scale", "output_norm.weight") + name = rename_transformer(name) + + elif name == "audio_head": + is_decoder = True + name = "audio_head.weight" + if component == "decoder": + # add padding at the beginning and the end so that build_lora_mm_id can be used + zero_tensor = torch.zeros(1, 1024, 2051) + data_torch = torch.cat([zero_tensor, data_torch, zero_tensor], dim=0) + assert data_torch.shape == (33, 1024, 2051) + # then, transpose it + data_torch = data_torch.transpose(1, 2) + + elif name == "projection.weight": + is_decoder = True + is_backbone = True + name = "csm_proj.weight" + + if can_quantize: + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + # decoder is very sensitive to quantization, do not quantize it lower than F16 + data_qtype = gguf.GGMLQuantizationType.Q8_0 if component != "decoder" \ + else gguf.GGMLQuantizationType.F16 + else: + 
raise ValueError(f"Unsupported file type: {self.ftype}") + + data = data_torch.numpy() + + try: + data = gguf.quants.quantize(data, data_qtype) + except Exception as e: + logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + if (is_backbone and component == "backbone") or (is_decoder and component == "decoder"): + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" + logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + if component == "backbone": + self.gguf_writer_backbone.add_tensor(name, data, raw_dtype=data_qtype) + elif component == "decoder": + self.gguf_writer_decoder.add_tensor(name, data, raw_dtype=data_qtype) + + def write(self): + self._write_single(self.gguf_writer_backbone, "backbone") + self._write_single(self.gguf_writer_decoder, "decoder") + + def _write_single(self, gguf_writer: gguf.GGUFWriter, component: str): + output_path = str(self.fname_out).replace("", component) + gguf_writer.write_header_to_file(path=Path(output_path)) + gguf_writer.write_kv_data_to_file() + gguf_writer.write_tensors_to_file(progress=True) + gguf_writer.close() + + @staticmethod + def undo_permute(weights: Tensor, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert Sesame model to GGUFs (multiple files)",) + parser.add_argument( + "--outfile", type=Path, default="sesame-csm-.gguf", + help="path to write to, the '' placeholder is required and will be replaced with 'backbone' and 'decoder'", + ) + parser.add_argument( + "--vocab", 
type=Path, default="models/ggml-vocab-llama-bpe.gguf", + help="path to vocab GGUF", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", + help="output format", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", type=Path, + help="path to safetensors or model ID containing model file (if model ID is specified, download from Hugging Face hub)", + nargs="?", + default="sesame/csm-1b:model.safetensors", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + + args = parser.parse_args() + if args.model is None: + parser.error("the following arguments are required: model") + return args + + +def main() -> None: + args = parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + dir_model = args.model + path_vocab = args.vocab + + dir_parts = str(dir_model).split(":") + if len(dir_parts) == 2: + try: + dir_model = Path(hf_hub_download(dir_parts[0], dir_parts[1])) + except Exception as e: + print("Error downloading model from Hugging Face hub:", e) + print() + print("Please make sure you have access to the model") + print("Hint: you may need to set HF_TOKEN by running: huggingface-cli login") + + if not path_vocab.exists(): + raise FileNotFoundError(f"Vocab file not found: {path_vocab} ; Hint: download it from https://github.com/ggml-org/llama.cpp/blob/master/models/ggml-vocab-llama-bpe.gguf") + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + } + + logger.info(f"Loading model: {dir_model}") + + with torch.inference_mode(): + converter = CSMModelConverter( + safetensors_path=dir_model, + fname_out=args.outfile, + 
path_to_vocab_gguf=path_vocab, + ftype=ftype_map[args.outtype], + is_big_endian=args.bigendian, + ) + converter.write() + + +if __name__ == '__main__': + main() + diff --git a/examples/tts/convert_mimi_to_gguf.py b/examples/tts/convert_mimi_to_gguf.py new file mode 100644 index 0000000000000..81cb8f48cc25e --- /dev/null +++ b/examples/tts/convert_mimi_to_gguf.py @@ -0,0 +1,191 @@ +import gguf +import argparse +import logging +import torch +from typing import Union +from pathlib import Path +from torch import Tensor +from transformers import MimiModel, PreTrainedModel + +logger = logging.getLogger("mimi") + + +class MimiModelConverter: + mimi_model: PreTrainedModel + gguf_writer: gguf.GGUFWriter + fname_out: Path + ftype: gguf.LlamaFileType + + def __init__(self, + pretrained_model_name_or_path: Union[Path, str], + fname_out: Path, + ftype: gguf.LlamaFileType, + is_big_endian: bool,): + self.mimi_model = MimiModel.from_pretrained(pretrained_model_name_or_path) + self.fname_out = fname_out + self.ftype = ftype + endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.gguf_writer = gguf.GGUFWriter( + path=None, + arch="this model cannot be used as LLM, use it via --model-vocoder in TTS examples", + endianess=endianess) + + assert self.mimi_model.config.architectures[0] == "MimiModel" + + # load tensors + for name, data_torch in self.mimi_model.state_dict().items(): + # convert any unsupported data types to float32 + old_dtype = data_torch.dtype + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + self.add_tensor(name, data_torch, old_dtype) + + def add_tensor(self, name: str, data_torch: Tensor, old_dtype: torch.dtype): + is_1d = len(data_torch.shape) == 1 + is_bias = ".bias" in name + can_quantize = not is_1d and not is_bias + data_qtype = gguf.GGMLQuantizationType.F32 + + n_head = self.mimi_model.config.num_attention_heads + n_kv_head = self.mimi_model.config.num_key_value_heads + if 
name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = self.undo_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = self.undo_permute(data_torch, n_head, n_kv_head) + + # process codebook + if ".codebook.initialized" in name: + # "initialized" tensor + state_dict = self.mimi_model.state_dict() + embed_sum = state_dict[name.replace(".initialized", ".embed_sum")] + cluster_usage = state_dict[name.replace(".initialized", ".cluster_usage")] + # see modeling_mimi.py --> MimiEuclideanCodebook + data_torch = embed_sum / cluster_usage.clamp(min=self.mimi_model.config.norm_eps)[:, None] + name = name.replace(".initialized", "") + + # ignore processed tensors + if ".cluster_usage" in name or ".embed_sum" in name: + return + + # transpose some tensors + if ".conv.bias" in name: + data_torch = data_torch.view((1, data_torch.shape[0])) + data_torch = data_torch.transpose(0, 1) + + # change view 3d to 2d + if "quantizer" in name and "_proj." 
in name: + assert data_torch.shape[2] == 1 + data_torch = data_torch.view((data_torch.shape[0], data_torch.shape[1])) + + # shorten name, otherwise it will be too long for ggml to read + name = name.replace("_residual_vector_quantizer", "_rvq") + + if can_quantize: + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + else: + raise ValueError(f"Unsupported file type: {self.ftype}") + + # Conv kernels are always F16 + if ".conv.weight" in name: + data_qtype = gguf.GGMLQuantizationType.F16 + + data = data_torch.numpy() + + try: + data = gguf.quants.quantize(data, data_qtype) + except Exception as e: + logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" + logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) + + def write(self): + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + @staticmethod + def undo_permute(weights: Tensor, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + +def parse_args() -> argparse.Namespace: + parser = 
argparse.ArgumentParser( + description="Convert Mimi safetensors model to GGUF",) + parser.add_argument( + "--outfile", type=Path, default="kyutai-mimi.gguf", + help="path to write to", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", + help="output format", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", type=Path, + help="directory or model ID containing model file (if model ID is specified, download from Hugging Face hub)", + nargs="?", + default="kyutai/mimi", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + + args = parser.parse_args() + if args.model is None: + parser.error("the following arguments are required: model") + return args + + +def main() -> None: + args = parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + dir_model = args.model + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + } + + logger.info(f"Loading model: {dir_model}") + + with torch.inference_mode(): + converter = MimiModelConverter( + pretrained_model_name_or_path=dir_model, + fname_out=args.outfile, + ftype=ftype_map[args.outtype], + is_big_endian=args.bigendian, + ) + converter.write() + + +if __name__ == '__main__': + main() + diff --git a/examples/tts/csm-demo.txt b/examples/tts/csm-demo.txt new file mode 100644 index 0000000000000..1c913388bfb3d --- /dev/null +++ b/examples/tts/csm-demo.txt @@ -0,0 +1,5 @@ +[0]Hey how are you doing. +[1]Pretty good, pretty good. +[0]I'm great, so happy to be speaking to you. +What about you? +[1]Me too, this is some cool stuff huh? 
diff --git a/examples/tts/csm_generate_speaker.py b/examples/tts/csm_generate_speaker.py new file mode 100644 index 0000000000000..a06dee6846eac --- /dev/null +++ b/examples/tts/csm_generate_speaker.py @@ -0,0 +1,80 @@ +import argparse +from pathlib import Path +from transformers import MimiModel, AutoFeatureExtractor +from transformers.models.mimi.modeling_mimi import MimiEncoderOutput + +# pyright: reportMissingImports=false +from scipy.io.wavfile import read +from scipy.signal import resample +import numpy as np + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate speaker reference file, used by llama-tts-csm example",) + parser.add_argument( + "--model-path", type=Path, + help="custom Mimi model path (safetensors model). If not specified, will use the default model from Hugging Face hub", + ) + parser.add_argument( + "infile", type=Path, + help="the wav input file to use for generating the speaker reference file", + nargs="?", + ) + # parser.add_argument( + # "outfile", type=Path, + # help="the output file, defaults to the input file with .codes suffix", + # nargs="?", + # ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + if args.infile is None: + raise ValueError("Input file is required") + + if not args.infile.exists(): + raise FileNotFoundError(f"Input file {args.infile} not found") + + # if args.outfile is None: + # args.outfile = args.infile.with_suffix(".codes") + + model = MimiModel.from_pretrained(args.model_path or "kyutai/mimi") + feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_path or "kyutai/mimi") + + inp_audio = read(args.infile) + original_sample_rate = inp_audio[0] + audio_data = inp_audio[1] + + # If stereo, get only the first channel + if len(audio_data.shape) > 1 and audio_data.shape[1] >= 2: + audio_data = audio_data[:, 0] + + # resample + target_sample_rate = 24000 + number_of_samples = round(len(audio_data) * float(target_sample_rate) 
/ original_sample_rate) + resampled_audio = resample(audio_data, number_of_samples) + resampled_audio = resampled_audio / max(np.max(np.abs(resampled_audio)), 1e-10) + + # pre-process the inputs + audio_sample = np.array(resampled_audio, dtype=float) + inputs = feature_extractor(raw_audio=audio_sample, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt") + print('inputs', inputs["input_values"], inputs["input_values"].shape) + + # encode + encoder_outputs = model.encode(inputs["input_values"]) + assert isinstance(encoder_outputs, MimiEncoderOutput), "encoder_outputs should be of type MimiEncoderOutput" + + # output + flattened_audio_codes = encoder_outputs.audio_codes.transpose(-1, -2).flatten() + for i in range(0, len(flattened_audio_codes), 16): + for code in flattened_audio_codes[i:i+16].tolist(): + print(f"{code:<5}", end=",") + print() + + +if __name__ == '__main__': + main() diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp new file mode 100644 index 0000000000000..fee88c679e1f3 --- /dev/null +++ b/examples/tts/mimi-model.cpp @@ -0,0 +1,734 @@ +#include "ggml.h" +#include "ggml-cpp.h" +#include "ggml-cpu.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "gguf.h" + +#include "common.h" +#include "mimi-model.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Implementation of Kyutai's Mimi model using GGML. + * Based on this research: https://github.com/ngxson/ggml-easy/blob/master/demo/kyutai-mimi.cpp + * + * NOTE: only decoder is working for now. + * + * Background: + * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc + * - Audio codes must be in the order: N semantic codes followed by (N*31) acoustic codes + * (In other words, input matrix has shape 32 cols x N rows) + * + * How it works? + * 1. Audio code passed to RVQ (mimi_residual_vector_quantizer) to get the latent code + * 2. 
The latent code is passed to a mimi_conv_transpose_1d (depthwise) to upscale + * 3. The upscaled code is passed to transformer, it converts N frames to N frames + * 4. The output embeddings is then passed to SEANet (mimi_encoder_decoder) to get the final waveform + * 5. Waveform is written to a file + */ + +// copied from https://huggingface.co/kyutai/mimi/blob/main/config.json +struct mimi_config_t { + bool causal = true; + int sample_rate = 24000; + int max_position_embeddings = 8000; + int num_hidden_layers = 8; + int n_embd = 512; + int n_ffn = 2048; + int n_head = 8; + int n_head_kv = 8; + int n_rot = 64; + float norm_eps = 1e-5; + float rope_theta = 10000.0f; + int sliding_window = 250; + std::array upsampling_ratio = {8, 6, 5, 4}; + std::array downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio + // vector quantizer + float frame_rate = 12.5; + int audio_channels = 1; + int codebook_size = 2048; + int codebook_dim = 256; + int n_semantic_components = 1; + int n_acoustic_components = 31; + // decode + float trim_right_ratio = 1.0f; + int n_codes_per_frame = (sliding_window / 2) * (n_semantic_components + n_acoustic_components); +} mimi_config; + +// Adapted from https://github.com/ngxson/ggml-easy/blob/master/ggml-easy.h +struct mimi_ggml_ctx { + gguf_context * ctx_gguf = nullptr; + ggml_context * ctx_data = nullptr; + ggml_context * ctx_gf = nullptr; + + // CPU-only for now, as many kernels are missing and we actually get less performance with GPU + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buf = nullptr; + ggml_backend_sched_ptr sched; + + ggml_cgraph * gf = nullptr; + std::vector buf_compute_meta; + int max_nodes = 16 * 1024; + + std::unordered_map tensors; + + mimi_ggml_ctx() { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + auto buft = ggml_backend_get_default_buffer_type(backend); + sched.reset( + ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false) + ); + 
buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + } + + void load_gguf(const char * fname) { + ggml_context * meta = nullptr; + + gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + ctx_gguf = gguf_init_from_file(fname, params); + + // load tensors + const int n_tensors = gguf_get_n_tensors(ctx_gguf); + + std::vector read_buf; + ggml_init_params ggml_params = { + /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ctx_data = ggml_init(ggml_params); + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + ggml_free(meta); + throw std::runtime_error("cannot open model file for loading tensors"); + } + + // add tensors to context + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * t = ggml_get_tensor(meta, name); + ggml_tensor * cur = ggml_dup_tensor(ctx_data, t); + ggml_set_name(cur, name); + tensors.insert({name, cur}); + } + + // alloc memory and offload data + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft); + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i); + // printf("%s: Loading tensor \"%s\"\n", __func__, name); + fin.seekg(offset, std::ios::beg); + if (!fin) { + ggml_free(meta); + throw std::runtime_error(string_format("failed to seek for tensor: %s", name)); + } + int num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a 
temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + printf("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname); + fin.close(); + + ggml_free(meta); + } + + /** + * Build a cgraph using the given builder function. + * + * The built cgraph will be stored in `ctx.gf` + */ + void build_graph(std::function builder_fn) { + ggml_free(ctx_gf); + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx_gf = ggml_init(params); + ggml_backend_sched_reset(sched.get()); + gf = ggml_new_graph_custom(ctx_gf, max_nodes, false); + + builder_fn(ctx_gf, gf); + ggml_backend_sched_alloc_graph(sched.get(), gf); + } + + ggml_status compute() { + ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf); + return status; + } + + void set_tensor_data(const std::string & name, const void * data) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); + } + + std::pair> get_tensor_data(const std::string & name) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + std::vector data(ggml_nbytes(t)); + ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); + return std::make_pair(t, data); + } + + ggml_tensor * get_weight(const char *fmt, ...) 
// Integer division rounded towards +infinity, i.e. ceil(a / b) for b != 0.
//
// C++ integer division truncates towards zero, which already equals the
// ceiling whenever the exact quotient is negative. We therefore only round up
// when there is a remainder AND the exact quotient is positive (remainder and
// divisor have the same sign). The naive `a / b + (a % b ? 1 : 0)` is off by
// one for e.g. div_ceil(-3, 4), which must be 0.
static int64_t div_ceil(int64_t a, int64_t b) {
    const int64_t quot = a / b;
    const int64_t rem  = a % b;
    const bool round_up = rem != 0 && ((rem < 0) == (b < 0));
    return quot + (round_up ? 1 : 0);
}
ggml_concat(ctx0, out, row, 1) : row; + } + + } else { + out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation); + // unpad + out = ggml_view_2d(ctx0, out, + out->ne[0] - p_total, out->ne[1], + out->nb[1], p_left*ggml_element_size(out)); + } + + if (bias) { + out = ggml_add(ctx0, out, bias); + } + + return out; +} + + + +/////////////////////////////////////////////////////////////////////////// + +// based on MimiEncoder +// SEANet encoder as used by Mimi. +struct mimi_encoder_decoder { + mimi_ggml_ctx & ctx; + struct layer { + bool is_elu = false; + bool is_resnet = false; + bool is_transposed_conv = false; + ggml_tensor * conv_0_w = nullptr; + ggml_tensor * conv_0_b = nullptr; + ggml_tensor * conv_1_w = nullptr; + ggml_tensor * conv_1_b = nullptr; + int stride = 1; + }; + std::vector layers; + + std::array repeated_pattern = {1, 4, 7, 10}; + + mimi_encoder_decoder(mimi_ggml_ctx & ctx): ctx(ctx) { + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"), + }); + for (int i = 0; i < (int)repeated_pattern.size(); ++i) { + int i_start = repeated_pattern[i]; + // upsampling layers + layers.push_back({ + .is_elu = true, // layer (i_start) + }); + layers.push_back({ + .is_transposed_conv = true, + .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), + .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), + .stride = mimi_config.upsampling_ratio[i], + }); + // residual layers + layers.push_back({ + .is_resnet = true, + .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2), + .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2), + .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2), + .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2), + }); + } + layers.push_back({ + .is_elu = true, // layer 13 + }); + layers.push_back({ + 
.conv_0_w = ctx.get_weight("decoder.layers.14.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"), + }); + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) { + ggml_tensor * x = input; + + for (auto & layer : layers) { + if (layer.is_elu) { + x = ggml_elu(ctx0, x); + } else if (layer.is_resnet) { + ggml_tensor * residual = x; + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1); + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1); + x = ggml_add(ctx0, x, residual); + } else { + x = layer.is_transposed_conv + ? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false) + : mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1); + } + } + + return x; + } +}; + +struct mimi_transformer { + struct layer { + ggml_tensor * inp_norm_w = nullptr; + ggml_tensor * inp_norm_b = nullptr; + + ggml_tensor * attn_q = nullptr; + ggml_tensor * attn_k = nullptr; + ggml_tensor * attn_v = nullptr; + ggml_tensor * attn_o = nullptr; + ggml_tensor * attn_post_norm_w = nullptr; + ggml_tensor * attn_post_norm_b = nullptr; + ggml_tensor * attn_layer_scale = nullptr; + + ggml_tensor * ffn_up = nullptr; + ggml_tensor * ffn_down = nullptr; + ggml_tensor * mlp_layer_scale = nullptr; + }; + std::vector layers; + + mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) { + for (int il = 0; il < n_layers; il++) { + layers.push_back({ + .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il), + .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il), + + .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il), + .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il), + .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il), + .attn_o = 
ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il), + .attn_post_norm_w = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il), + .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il), + .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il), + + .ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il), + .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il), + .mlp_layer_scale = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il), + }); + } + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) { + int n_tokens = input->ne[1]; + ggml_tensor * x = input; + + auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { + x = ggml_norm(ctx0, x, mimi_config.norm_eps); + x = ggml_mul(ctx0, x, w); + x = ggml_add(ctx0, x, b); + return x; + }; + + ggml_tensor * residual = input; + + for (auto & layer : layers) { + residual = x; + + // input layer norm + x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b); + + // self attention + { + ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x); + ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x); + ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x); + + int n_embd_head = mimi_config.n_embd / mimi_config.n_head; + q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens); + k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens); + v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens); + + int n_rot = n_embd_head; + q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0); + q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); + + k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0); + k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + 
ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp + kq = ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head)); + ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens); + kq = ggml_soft_max_inplace(ctx0, kq_masked); + + v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head); + kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens); + + x = ggml_mul_mat(ctx0, layer.attn_o, kqv); + } + + // residual + x = ggml_mul(ctx0, x, layer.attn_layer_scale); + x = ggml_add(ctx0, x, residual); + + residual = x; + x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b); + + // mlp + { + x = ggml_mul_mat(ctx0, layer.ffn_up, x); + x = ggml_gelu(ctx0, x); + x = ggml_mul_mat(ctx0, layer.ffn_down, x); + } + + // residual + x = ggml_mul(ctx0, x, layer.mlp_layer_scale); + x = ggml_add(ctx0, x, residual); + } + + return x; + } +}; + +struct mimi_residual_vector_quantizer { + struct component { + ggml_tensor * codebook; + }; + + ggml_tensor * semantic_inp_proj; + std::vector semantic_components; + ggml_tensor * semantic_out_proj; + + ggml_tensor * acoustic_inp_proj; + std::vector acoustic_components; + ggml_tensor * acoustic_out_proj; + + mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) { + semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight"); + semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_semantic_components; i++) { + semantic_components.push_back({ + .codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i), + }); + } + acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight"); + acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_acoustic_components; 
i++) { + acoustic_components.push_back({ + .codebook = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i), + }); + } + } + + // the input has shape [n_codes, n_codes_per_embd] + // first row is semantic, the rest are acoustic + // example: [ [semantic], [acoustic1], [acoustic2], ... ] + ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) { + GGML_ASSERT(input->type == GGML_TYPE_I32); + + size_t n_semantic = semantic_components.size(); + int64_t n_codes_per_embd = (n_semantic + acoustic_components.size()); + int64_t n_codes = input->ne[0] / n_codes_per_embd; + + GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0); + + ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + out_s = ggml_scale(ctx0, out_s, 0.0f); // clear + out_a = ggml_scale(ctx0, out_a, 0.0f); // clear + + for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) { + ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input)); + if (ir < n_semantic) { + // semantic + ggml_tensor * codebook = semantic_components[ir].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_s = ggml_add(ctx0, out_s, embd); + } else { + // acoustic + ggml_tensor * codebook = acoustic_components[ir-n_semantic].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_a = ggml_add(ctx0, out_a, embd); + } + } + + out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s); + out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a); + + return ggml_add(ctx0, out_s, out_a); + } +}; + + +mimi_model::mimi_model(const char * fname, bool verbose) : verbose(verbose) { + ctx.reset(new mimi_ggml_ctx()); + ctx->load_gguf(fname); + + // initialize components + seanet_dec .reset(new mimi_encoder_decoder(*ctx)); + transformer_dec.reset(new mimi_transformer(*ctx, "decoder", mimi_config.num_hidden_layers)); + quantizer 
.reset(new mimi_residual_vector_quantizer(*ctx)); +} + +mimi_model::~mimi_model() { +} + +std::vector mimi_model::decode_frame(const std::vector & codes, int & n_past) { + // build cgraph + int n_pos = -1; + int n_codes = codes.size(); + int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; + GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiply of n_codes_per_embd"); + + ctx->build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) { + ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes); + ggml_set_name(inp_dec, "inp_dec"); + ggml_set_input(inp_dec); + + // RVQ + ggml_tensor * embeddings = quantizer->decode(ctx_gf, inp_dec); + + // upsample + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings, ctx->get_weight("upsample.conv.weight"), nullptr, 2, 1, true); + + // transformer + n_pos = embeddings->ne[0]; + ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_dec, "pos_dec"); + ggml_set_input(pos_dec); + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + embeddings = transformer_dec->forward(ctx_gf, embeddings, pos_dec); + + // SEANET decoder + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + ggml_tensor * output = seanet_dec->forward(ctx_gf, embeddings); + + ggml_set_name(output, "output"); + ggml_set_output(output); + ggml_build_forward_expand(gf, output); + }); + + // position data + GGML_ASSERT(n_pos <= mimi_config.sliding_window); + std::vector pos_data(n_pos); + for (int i = 0; i < (int)pos_data.size(); i++) { + pos_data[i] = i + n_past; + } + if (verbose) { + printf("%s: n_pos: %d, n_past: %d\n", __func__, n_pos, n_past); + } + n_past += n_pos; + ctx->set_tensor_data("pos_dec", pos_data.data()); + + // code data + auto codes_T = mimi_model::transpose_input(codes); + ctx->set_tensor_data("inp_dec", codes_T.data()); + + 
ctx->compute(); + + auto output = ctx->get_tensor_data("output"); + // auto output_tensor = output.first; + auto output_data = output.second; + // printf("Output shape: [%lld, %lld]\n", output_tensor->ne[0], output_tensor->ne[1]); + + std::vector wav_data(output_data.size() / sizeof(float)); + for (size_t i = 0; i < wav_data.size(); i++) { + wav_data[i] = ((float *)output_data.data())[i]; + } + + return wav_data; +} + +std::vector mimi_model::decode(const std::vector & codes) { + std::vector output; + + if (verbose) { + printf("%s: n_codes: %zu\n", __func__, codes.size()); + } + + int64_t t_start = ggml_time_ms(); + int n_frames = 0; + + int n_past = 0; + for (size_t i = 0; i < codes.size(); i += mimi_config.n_codes_per_frame) { + size_t remaining = std::min((size_t)mimi_config.n_codes_per_frame, codes.size() - i); + std::vector frame(codes.begin() + i, codes.begin() + i + remaining); + + auto wav_data = decode_frame(frame, n_past); + output.insert(output.end(), wav_data.begin(), wav_data.end()); + + n_frames++; + } + + int64_t t_end = ggml_time_ms(); + if (verbose) { + printf("%s: n_frames: %d, time: %" PRId64 "ms, per_frame: %" PRId64 "ms\n", __func__, n_frames, t_end - t_start, (t_end - t_start) / n_frames); + } + + return output; +} + +std::vector mimi_model::transpose_input(const std::vector & codes) { + int n_codes = codes.size(); + int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; + GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiply of n_codes_per_embd"); + + std::vector codes_T(n_codes); + for (int i = 0; i < n_codes / n_codes_per_embd; i++) { + for (int j = 0; j < n_codes_per_embd; j++) { + int src_idx = i * n_codes_per_embd + j; + int dst_idx = j * (n_codes / n_codes_per_embd) + i; + codes_T[dst_idx] = codes[src_idx]; + } + } + + return codes_T; +} + +int mimi_model::get_sample_rate() const { + return mimi_config.sample_rate; +} diff --git a/examples/tts/mimi-model.h 
// Wrapper around the Mimi model loaded from a GGUF file.
// Only the decoding pass (codes -> audio samples) is implemented.
struct mimi_model {
    // when true, progress/timing information is printed to stdout
    bool verbose = false;
    std::unique_ptr<mimi_ggml_ctx> ctx;

    std::unique_ptr<mimi_encoder_decoder>           seanet_dec;
    std::unique_ptr<mimi_transformer>               transformer_dec;
    std::unique_ptr<mimi_residual_vector_quantizer> quantizer;

    // load the model from the GGUF file `fname`
    mimi_model(const char * fname, bool verbose = false);
    ~mimi_model();

    // sample rate of the audio returned by decode()
    int get_sample_rate() const;

    // layout of codes: (1 semantic code followed by 31 acoustic codes) repeated N times
    std::vector<float> decode(const std::vector<int> & codes);

    // TODO: implement encoding pass
    // std::vector<int> encode(const std::vector<float> & wav_data);

private:
    // decode one chunk of codes; n_past is advanced by the number of positions consumed
    std::vector<float> decode_frame(const std::vector<int> & codes, int & n_past);

    // transpose layout (from streaming layout to non-streaming):
    // - from: (1 semantic code followed by 31 acoustic codes) repeated N times
    // - to:   N semantic codes followed by (N*31) acoustic codes
    // streaming layout is 1-31, 1-31, 1-31, ..., used for real-time processing
    static std::vector<int> transpose_input(const std::vector<int> & codes);
};
+ */ + +int main(int argc, const char ** argv) { + if (argc < 3) { + fprintf(stderr, "Usage: %s model.gguf codes.txt [output.wav]\n", argv[0]); + fprintf(stderr, " Format of codes.txt file: one code per line\n"); + fprintf(stderr, " Replace codes.txt with dummy0 and dummy1 for testing\n"); + fprintf(stderr, " dummy0: using code 1, 2, 3,..., 96, used for logits matching\n"); + fprintf(stderr, " dummy1: using code that will outputs 'wah hello there' sound\n"); + return 1; + } + + const char * model_path = argv[1]; + const char * codes_path = argv[2]; + const char * out_path = argc < 4 ? "output.wav" : argv[3]; + + // load codes + std::vector codes; + if (strcmp(codes_path, "dummy0") == 0) { + printf("Using dummy0 codes\n"); + codes.resize(32 * 3); // [n_codes_per_embd = 32, n_codes = 3] + for (int i = 0; i < (int)codes.size(); i++) { + codes[i] = i; + } + } else if (strcmp(codes_path, "dummy1") == 0) { + printf("Using dummy1 codes\n"); + codes = { + 1049 ,1597 ,1325 ,839 ,592 ,1440 ,1341 ,985 ,1239 ,1146 ,1778 ,1636 ,1485 ,1622 ,757 ,480 , + 1899 ,1481 ,840 ,1397 ,82 ,1565 ,116 ,1449 ,1038 ,1015 ,436 ,150 ,159 ,1414 ,1740 ,1971 , + 1415 ,175 ,1539 ,776 ,1046 ,117 ,803 ,1499 ,1457 ,1307 ,2 ,1135 ,1287 ,1039 ,1124 ,716 , + 1798 ,201 ,1517 ,1299 ,886 ,1786 ,521 ,353 ,1912 ,1357 ,1311 ,450 ,297 ,971 ,1154 ,1729 , + 1962 ,1280 ,1943 ,878 ,1588 ,723 ,568 ,1736 ,1021 ,983 ,10 ,833 ,973 ,1209 ,1091 ,681 , + 1606 ,779 ,334 ,765 ,1836 ,1400 ,150 ,877 ,464 ,1487 ,870 ,1114 ,1703 ,476 ,1839 ,666 , + 914 ,1202 ,1601 ,1719 ,1670 ,412 ,568 ,1838 ,341 ,1237 ,1279 ,830 ,1815 ,32 ,1369 ,1686 , + 1307 ,419 ,1143 ,1158 ,325 ,1696 ,1597 ,93 ,795 ,4 ,1032 ,369 ,819 ,1685 ,912 ,282 , + 1372 ,1911 ,141 ,1069 ,1485 ,642 ,1370 ,702 ,284 ,1407 ,999 ,1758 ,314 ,679 ,1061 ,1624 , + 1549 ,430 ,823 ,1809 ,1976 ,232 ,727 ,266 ,747 ,253 ,134 ,267 ,93 ,428 ,731 ,1993 , + 704 ,85 ,257 ,1302 ,1141 ,1717 ,1995 ,1345 ,882 ,1350 ,1549 ,2015 ,2020 ,732 ,415 ,335 , + 1814 ,1451 ,454 ,1299 ,761 ,1736 ,1916 
,1853 ,56 ,1871 ,984 ,1273 ,247 ,1802 ,602 ,1551 , + 1922 ,47 ,564 ,893 ,34 ,131 ,1063 ,1657 ,474 ,1960 ,1049 ,1275 ,424 ,976 ,1217 ,865 , + 114 ,1000 ,725 ,1585 ,359 ,512 ,815 ,1255 ,124 ,933 ,1983 ,1136 ,1366 ,653 ,1064 ,1703 , + 2036 ,692 ,1435 ,2005 ,1465 ,37 ,892 ,511 ,1559 ,1255 ,373 ,1675 ,1085 ,1462 ,1135 ,1356 , + 483 ,156 ,1298 ,1776 ,1136 ,518 ,1826 ,872 ,431 ,215 ,1103 ,1578 ,144 ,1290 ,1508 ,1124 , + 288 ,632 ,876 ,875 ,1156 ,345 ,273 ,1774 ,1923 ,878 ,1355 ,287 ,982 ,805 ,1360 ,1688 , + 958 ,1062 ,1325 ,625 ,1720 ,1895 ,1382 ,1974 ,1868 ,1228 ,1627 ,1063 ,1617 ,614 ,834 ,1628 , + 968 ,251 ,1096 ,908 ,1938 ,112 ,895 ,1787 ,273 ,1979 ,1200 ,744 ,1994 ,402 ,1578 ,307 , + 1919 ,615 ,649 ,1539 ,2036 ,1854 ,653 ,556 ,609 ,633 ,1627 ,1820 ,1428 ,1663 ,1387 ,1725 , + 193 ,1553 ,636 ,586 ,435 ,1979 ,1226 ,945 ,1330 ,1500 ,1466 ,89 ,1563 ,1150 ,1205 ,366 , + 1179 ,1353 ,1737 ,830 ,904 ,1584 ,1596 ,1885 ,855 ,1306 ,414 ,120 ,812 ,1528 ,252 ,107 , + 1139 ,1735 ,61 ,2001 ,753 ,2034 ,354 ,1927 ,1406 ,1939 ,1009 ,430 ,1269 ,170 ,1785 ,541 , + 898 ,414 ,913 ,1563 ,719 ,1393 ,286 ,857 ,1522 ,2024 ,1845 ,779 ,121 ,1344 ,745 ,808 , + 897 ,1577 ,1497 ,186 ,1418 ,1822 ,1726 ,947 ,1782 ,1415 ,75 ,1724 ,1769 ,1529 ,1835 ,1262 , + 834 ,1214 ,685 ,461 ,526 ,1869 ,1373 ,992 ,912 ,1453 ,583 ,652 ,1637 ,798 ,1034 ,1096 , + 897 ,132 ,1010 ,1932 ,277 ,1536 ,1541 ,952 ,19 ,88 ,2042 ,1232 ,1681 ,2013 ,1241 ,1167 , + 1526 ,1487 ,761 ,308 ,1567 ,1702 ,177 ,5 ,1709 ,900 ,1699 ,1266 ,1620 ,1027 ,1102 ,1753 , + 1243 ,471 ,485 ,1765 ,391 ,1281 ,1607 ,1418 ,116 ,1702 ,1725 ,1692 ,1082 ,350 ,14 ,59 , + 386 ,882 ,2010 ,1438 ,145 ,789 ,1397 ,1921 ,1507 ,457 ,1458 ,1929 ,289 ,1305 ,965 ,500 , + 1511 ,433 ,284 ,721 ,1741 ,56 ,615 ,916 ,887 ,1253 ,916 ,535 ,1666 ,1175 ,716 ,269 , + 447 ,32 ,63 ,321 ,1860 ,1986 ,1009 ,1849 ,1062 ,471 ,2018 ,1213 ,1557 ,990 ,696 ,677 , + }; + } else { + std::ifstream fin(codes_path); + if (!fin) { + fprintf(stderr, "Error: cannot open codes file: %s\n", 
codes_path); + return 1; + } + std::string line; + while (std::getline(fin, line)) { + // Skip empty lines + if (line.empty()) continue; + // TODO: support both comma (with spaces) and new line + try { + int code = std::stoi(line); + codes.push_back(code); + } catch (const std::exception& e) { + fprintf(stderr, "Error parsing code: %s\n", line.c_str()); + return 1; + } + } + if (codes.empty()) { + fprintf(stderr, "Error: no codes found in file: %s\n", codes_path); + return 1; + } + + printf("Loaded %d codes from %s\n", (int)codes.size(), codes_path); + } + + mimi_model model(model_path, true); + std::vector wav_data = model.decode(codes); + + // print first 20 values + printf("Number of output samples: %d\n", (int)wav_data.size()); + printf("First 20 samples:\n"); + for (int i = 0; i < 20; i++) { + printf("%2.4f, ", wav_data[i]); + } + printf("...\n"); + + // write to wav + printf("Writing to %s\n", out_path); + save_wav16(out_path, wav_data, model.get_sample_rate()); +} diff --git a/examples/tts/tts-csm-data.h b/examples/tts/tts-csm-data.h new file mode 100644 index 0000000000000..c3c47ca7ac3a2 --- /dev/null +++ b/examples/tts/tts-csm-data.h @@ -0,0 +1,1513 @@ +#pragma once + +#include + +// https://huggingface.co/spaces/sesame/csm-1b/blob/main/prompts/conversational_a.wav +const char * default_speaker_a_text = "[0]like revising for an exam I'd have to try and like keep up the momentum because I'd start really early I'd be like okay I'm gonna start revising now and then like you're revising for ages and then I just like start losing steam I didn't do that for the exam we had recently to be fair that was a more of a last minute scenario but like yeah I'm trying to like yeah I noticed this yesterday that like Mondays I sort of start the day with this not like a panic but like a"; +std::initializer_list default_speaker_a_codes = {}; + +// https://huggingface.co/spaces/sesame/csm-1b/blob/main/prompts/conversational_b.wav +const char * default_speaker_b_text = "[1]like 
a super Mario level. Like it's very like high detail. And like, once you get into the park, it just like, everything looks like a computer game and they have all these, like, you know, if, if there's like a, you know, like in a Mario game, they will have like a question block. And if you like, you know, punch it, a coin will come out. So like everyone, when they come into the park, they get like this little bracelet and then you can go punching question blocks around."; +std::initializer_list default_speaker_b_codes = {}; diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp new file mode 100644 index 0000000000000..d9a5ef1102d89 --- /dev/null +++ b/examples/tts/tts-csm.cpp @@ -0,0 +1,479 @@ +#include "ggml.h" +#include "llama.h" +#include "common.h" +#include "log.h" +#include "arg.h" +#include "mimi-model.h" +#include "tts-csm-data.h" + +#include +#include +#include +#include +#include +#include // memcpy and strcmp +#include + +// For more details on how this works, see: https://github.com/ggml-org/llama.cpp/pull/12648 + +static void print_usage(int, char ** argv) { + LOG("\nExample usage:\n"); + LOG("\n By default, model will be downloaded from https://huggingface.co/ggml-org/sesame-csm-1b-GGUF"); + LOG("\n %s -p \"[0]I have a dream that one day every valley shall be exalted\" -o output.wav", argv[0]); + LOG("\n"); + LOG("\n To use a local model, specify the path to the model file:"); + LOG("\n %s -p ... -m sesame-csm-backbone.gguf -mv kyutai-mimi.gguf -o output.wav", argv[0]); + LOG("\n"); + LOG("\n Note: the model need 2 files to run, one ends with '-backbone-.gguf' and the other ends with '-decoder.gguf'"); + LOG("\n"); + LOG("\nPrompt format:"); + LOG("\n Each line must start with speaker ID in square brackets, followed by the text. One turn per line. 
A full stop is recommended at the end of each turn"); + LOG("\n Example:"); + LOG("\n [0]Hey how are you doing."); + LOG("\n [1]Pretty good, pretty good."); + LOG("\n If you want to enter long text, use -f file.txt to read from file"); + LOG("\n"); +} + +struct speaker_turn { + std::string text; + std::vector audio_embd; // only used for system prompt (speaker reference) processing + size_t n_embd_tokens = 0; +}; + +// split text containing "[N]..." into speaker turns +static std::vector get_speaker_turns(const std::string & input) { + if (input.empty()) { + LOG_ERR("Empty input\n"); + return {}; + } + if (input[0] != '[') { + LOG_ERR("Invalid input format: missing speaker ID\n"); + return {}; + } + std::regex re(R"((\[\d+\][\s\S]*?)(?=\[\d+\]|$))"); + std::smatch match; + std::vector turns; + std::string::const_iterator searchStart(input.cbegin()); + while (std::regex_search(searchStart, input.cend(), match, re)) { + std::string turn_text = match[1].str(); + if (turn_text.empty()) { + continue; + } + // clean up newline, the model is quite sensitive to this + string_replace_all(turn_text, "\n", " "); + turn_text = string_strip(turn_text); + // add turn + speaker_turn turn; + turn.text = turn_text; + turns.push_back(turn); + searchStart = match.suffix().first; + } + return turns; +} + +static speaker_turn get_ref_speaker_turn(const char * text, std::initializer_list & codes, std::vector & codebook) { + const size_t n_embd = 2048; + const size_t n_codes_per_codebook = 2051; + const size_t n_codebooks = 32; + GGML_ASSERT(codebook.size() == n_embd * n_codes_per_codebook * n_codebooks); + GGML_ASSERT(codes.size() % 32 == 0); + + // 1 frame = 32 codes + size_t n_frames = codes.size() / n_codebooks; + speaker_turn turn; + turn.text = text; + turn.audio_embd.reserve((n_frames+1) * n_embd); + turn.n_embd_tokens = n_frames+1; // +1 for EOS frame + + for (size_t i_fr = 0; i_fr <= n_frames; i_fr++) { + std::vector frame_embd_sum(n_embd, 0.0f); + + for (size_t i_cb = 0; i_cb < 
n_codebooks; i_cb++) { + const size_t code = i_fr == n_frames + ? 0 // insert audio EOS for last pseudo-frame + : codes.begin()[i_fr*n_codebooks + i_cb]; + printf("code %zu: %zu, codebook entry %zu\n", i_cb, code, i_cb*n_codes_per_codebook + code); + float * entry = codebook.data() + i_cb*n_codes_per_codebook*n_embd + code*n_embd; + for (size_t i_embd = 0; i_embd < n_embd; i_embd++) { + frame_embd_sum[i_embd] += entry[i_embd]; + } + } + + turn.audio_embd.insert(turn.audio_embd.end(), frame_embd_sum.begin(), frame_embd_sum.end()); + } + + GGML_ASSERT(turn.audio_embd.size() == (n_frames+1) * n_embd); + return turn; +} + +// sampling with custom n_vocab +// modified version of llama_sampler_sample() +static llama_token sample_token(struct llama_sampler * smpl, const float * logits, int n_vocab) { + std::vector cur; + cur.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array cur_p = { + /* .data = */ cur.data(), + /* .size = */ cur.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; + + llama_sampler_apply(smpl, &cur_p); + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); + auto token = cur_p.data[cur_p.selected].id; + llama_sampler_accept(smpl, token); + return token; +} + +struct hook_data { + std::vector embd; + std::vector codebook; +}; + +// hook to retrieve the embeddings +static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { + hook_data * data = (hook_data *) user_data; + + // output_csm_proj is the embeddings output from backbone + // output_audio_embd is the embeddings output from decoder + if (t && (strcmp(t->name, "output_csm_proj") == 0 || strcmp(t->name, "output_audio_embd") == 0)) { + if (ask) return true; + + GGML_ASSERT(t->type == GGML_TYPE_F32); + data->embd.resize(ggml_nelements(t)); + ggml_backend_tensor_get(t, data->embd.data(), 0, ggml_nbytes(t)); + // 
printf("%s tensor size: %lld, %lld\n", t->name, t->ne[0], t->ne[1]); + return true; + } + + if (t && strncmp(t->name, "audio_embd.weight", 18) == 0) { + if (ask) return true; + + // printf("%s tensor size: %lld, %lld\n", t->name, t->ne[0], t->ne[1]); + GGML_ASSERT(t->type == GGML_TYPE_F32); + GGML_ASSERT(t->ne[0] == 2048); // backbone embd size + data->codebook.resize(ggml_nelements(t)); + ggml_backend_tensor_get(t, data->codebook.data(), 0, ggml_nbytes(t)); + return true; + } + + return false; +} + +// convenience wrapper around llama_batch to handle memory allocation +struct decode_embd_batch { + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids [n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } +}; + +int main(int argc, char ** argv) { + common_params params; + + params.model.path = "sesame-csm-backbone.gguf"; + params.vocoder.model.path = "kyutai-mimi.gguf"; + params.out_file = "output.wav"; + params.prompt = ""; + params.n_predict = 2048; // CSM's max trained seq length + params.sampling.top_k = 50; // default param from CSM python code + params.sampling.temp = 0.9; // default param from CSM python code + + // HF model (hack: we temporary reuse speculative.model as the decoder model, only to get it downloaded) + params.model.url = 
"https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf"; + params.speculative.model.path = "sesame-csm-decoder.gguf"; + params.speculative.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-decoder.gguf"; + params.vocoder.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf"; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { + return 1; + } + + llama_backend_init(); + llama_numa_init(params.numa); + + if (params.prompt.empty()) { + LOG_ERR("prompt is empty\n"); + return 1; + } + + hook_data cb_data; + params.cb_eval = ggml_callback; + params.cb_eval_user_data = &cb_data; + + common_params params_decoder(params); // duplicate the params + params_decoder.n_ctx = 64; // we never use more than this + string_replace_all(params_decoder.model.path, "-backbone", "-decoder"); + string_replace_all(params_decoder.model.url, "-backbone", "-decoder"); + + common_init_result llama_backbone = common_init_from_params(params); + llama_model * model_bb = llama_backbone.model.get(); + llama_context * ctx_bb = llama_backbone.context.get(); + + common_init_result llama_decoder = common_init_from_params(params_decoder); + llama_model * model_dc = llama_decoder.model.get(); + llama_context * ctx_dc = llama_decoder.context.get(); + + if (model_bb == nullptr || ctx_bb == nullptr) { + return ENOENT; + } + + if (model_dc == nullptr || ctx_dc == nullptr) { + return ENOENT; + } + + mimi_model mimi(params.vocoder.model.path.c_str(), true); + + // init sampler + // the python implementation only has top-k and temperature sampling, so we'll use just that + llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_top_k(params.sampling.top_k)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(params.sampling.temp)); + 
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(params.sampling.seed)); + + llama_batch batch_prompt = llama_batch_init(params.n_batch, 0, 1); + llama_pos n_past_bb = 0; + + // inp_past_embd is the "squashed" embeddings from the decoder + std::vector inp_past_embd(2048, 0.0f); + llama_batch batch_past_embd = llama_batch_init(1, inp_past_embd.size(), 1); + + int64_t t_gb_start = ggml_time_ms(); // global start time + int64_t t_bb = 0; // backbone time + int64_t n_bb_gen = 0; // backbone generation count + int64_t t_dc = 0; // decoder time + int64_t n_dc_gen = 0; // decoder generation count + + std::vector generated_codes; + + std::vector turns; + // speaker reference + turns.push_back(get_ref_speaker_turn(default_speaker_a_text, default_speaker_a_codes, cb_data.codebook)); + turns.push_back(get_ref_speaker_turn(default_speaker_b_text, default_speaker_b_codes, cb_data.codebook)); + + // user input + auto custom_turns = get_speaker_turns(params.prompt); + turns.insert(turns.end(), custom_turns.begin(), custom_turns.end()); + + for (speaker_turn & turn : turns) { + // tokenize the turn + llama_tokens prompt_tokens; + { + printf("\n---\n\nturn: %s\n\n", turn.text.c_str()); + const llama_vocab * vocab = llama_model_get_vocab(model_bb); + prompt_tokens = common_tokenize(vocab, turn.text, false, true); + prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab)); + prompt_tokens.insert(prompt_tokens.end(), llama_vocab_eos(vocab)); + + printf("prompt (%zu tokens): \n", prompt_tokens.size()); + for (size_t i = 0; i < prompt_tokens.size(); ++i) { + printf("%d, ", prompt_tokens[i]); + } + printf("\n\n"); + + common_batch_clear(batch_prompt); + for (size_t i = 0; i < prompt_tokens.size(); ++i) { + common_batch_add(batch_prompt, prompt_tokens[i], n_past_bb++, { 0 }, false); + } + batch_prompt.logits[batch_prompt.n_tokens - 1] = true; + + if (llama_decode(ctx_bb, batch_prompt) != 0) { + LOG_ERR("%s: backbone llama_decode(text) failed\n", __func__); + 
return 1; + } + } + + // optionally process the system prompt (speaker reference) + if (turn.n_embd_tokens) { + decode_embd_batch batch_embd(turn.audio_embd.data(), turn.n_embd_tokens, n_past_bb, 0); + if (llama_decode(ctx_bb, batch_embd.batch) != 0) { + LOG_ERR("%s: backbone llama_decode(embeddings) failed\n", __func__); + return 1; + } + LOG_INF("%s: backbone done decoding %zu audio codes\n\n", __func__, turn.n_embd_tokens); + n_past_bb += turn.n_embd_tokens; + continue; // no need to generate the audio + } + + // backbone generation loop + bool is_end_of_turn = false; + for (int k = 0; k < params.n_predict; ++k) { + bool is_first_tok = k == 0; + + if (!is_first_tok) { + // generate the next RVQ semantic token + batch_past_embd.n_tokens = 1; + batch_past_embd.pos[0] = n_past_bb++; + batch_past_embd.seq_id[0][0] = 0; + batch_past_embd.n_seq_id[0] = 1; + batch_past_embd.logits[0] = true; + std::memcpy(batch_past_embd.embd, inp_past_embd.data(), inp_past_embd.size() * sizeof(float)); + + int64_t t_bb_start = ggml_time_ms(); + if (llama_decode(ctx_bb, batch_past_embd) != 0) { + LOG_ERR("%s: backbone llama_decode() failed\n", __func__); + return 1; + } + n_bb_gen++; + t_bb += ggml_time_ms() - t_bb_start; + } + + if (is_end_of_turn) { + // done decoding audio's EOS token + break; + } + + auto vocab_dc = llama_model_get_vocab(model_dc); + auto logits = llama_get_logits_ith(ctx_bb, is_first_tok ? 
(batch_prompt.n_tokens - 1) : 0); + // for (size_t i = 0; i < 10; ++i) { + // printf("%4.2f, ", logits[i]); + // } + // printf("\n"); + + llama_token semantic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc)); + printf("Sem token %5d : %d,", 1+(int)generated_codes.size()/32, semantic_tok); + generated_codes.push_back(semantic_tok); + + // for (size_t i = 0; i < 10; ++i) { + // printf("%4.2f, ", embd[i]); + // } + // printf("\n"); + + + // decoder generation loop + inp_past_embd = std::vector(inp_past_embd.size(), 0.0f); + { + llama_kv_self_clear(ctx_dc); + llama_batch batch_embd = llama_batch_init(1, cb_data.embd.size(), 1); + llama_batch batch_token = llama_batch_init(1, 0, 1); + + // first "token" is the latent embeddings from backbone + { + batch_embd.n_tokens = 1; + batch_embd.pos[0] = 0; + batch_embd.seq_id[0][0] = 0; + batch_embd.n_seq_id[0] = 1; + batch_embd.logits[0] = false; + std::memcpy(batch_embd.embd, cb_data.embd.data(), cb_data.embd.size() * sizeof(float)); + } + if (llama_decode(ctx_dc, batch_embd) != 0) { + LOG_ERR("%s: decoder llama_decode(embd) failed\n", __func__); + return 1; + } + + // then, decode the semantic_tok to generate acoustic tokens + llama_token tok = semantic_tok; + int n_codes = 32; + int sum_codes = semantic_tok; // to check if all codes are 0 + for (int i = 0; i < n_codes; ++i) { + common_batch_clear(batch_token); + // encoder vocab is further divided into 32 codebooks, each with 2051 entries + llama_token inp_tok = tok + 2051*i; + common_batch_add(batch_token, inp_tok, i+1, { 0 }, true); + + int64_t t_bb_start = ggml_time_ms(); + if (llama_decode(ctx_dc, batch_token) != 0) { + LOG_ERR("%s: decoder llama_decode(token) failed\n", __func__); + return 1; + } + n_dc_gen++; + t_dc += ggml_time_ms() - t_bb_start; + + // sample the acoustic token + auto logits = llama_get_logits_ith(ctx_dc, 0); + llama_token acoustic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc)); + + // discard last 
code (only for embeddings) + if (i < n_codes - 1) { + printf("%d,", acoustic_tok); + tok = acoustic_tok; // next input token + sum_codes += acoustic_tok; + generated_codes.push_back(acoustic_tok); + } + + // do progressive hsum of embeddings + GGML_ASSERT(inp_past_embd.size() == cb_data.embd.size()); + for (size_t i = 0; i < inp_past_embd.size(); ++i) { + inp_past_embd[i] += cb_data.embd[i]; + } + } + printf("\n"); + + llama_batch_free(batch_embd); + llama_batch_free(batch_token); + + // if all codes are 0, then we are done (got audio EOS token) + // note: we still need to run backbone decode one more time to decode the audio's EOS token + is_end_of_turn = sum_codes == 0; + if (is_end_of_turn) { + // remove last 32 codes since they will be all zeros + generated_codes.resize(generated_codes.size() - 32); + } + } + + // printf("inp_past_embd, n_past_bb = %d\n", n_past_bb); + // for (size_t i = 0; i < inp_past_embd.size(); ++i) { + // printf("%4.4f, ", inp_past_embd[i]); + // if (i == 2) { + // printf("... 
"); + // i = inp_past_embd.size() - 4; + // } + // } + // printf("\n"); + } + } + + // print timing info + printf("\ntimings:\n"); + printf(" backbone: %" PRId64 " ms, %" PRId64 " generated token (%.2f tok/s)\n", t_bb, n_bb_gen, (float)n_bb_gen*1000/(float)t_bb); + printf(" decoder: %" PRId64 " ms, %" PRId64 " generated token (%.2f tok/s)\n", t_dc, n_dc_gen, (float)n_dc_gen*1000/(float)t_dc); + printf(" total: %" PRId64 " ms\n\n", ggml_time_ms() - t_gb_start); + + llama_batch_free(batch_prompt); + llama_batch_free(batch_past_embd); + + printf("decode %zu RVQ tokens into wav...\n", generated_codes.size()); + std::vector wav_data = mimi.decode(generated_codes); + + printf("output wav file: %s\n", params.out_file.c_str()); + + if (!save_wav16(params.out_file.c_str(), wav_data, mimi.get_sample_rate())) { + LOG_ERR("Failed to save wav file\n"); + return 1; + } + + printf("\n"); + + return 0; +} diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 0f047986965f8..e5e0dd4573fda 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -71,46 +71,6 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -struct wav_header { - char riff[4] = {'R', 'I', 'F', 'F'}; - uint32_t chunk_size; - char wave[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - uint32_t fmt_chunk_size = 16; - uint16_t audio_format = 1; // PCM - uint16_t num_channels = 1; // Mono - uint32_t sample_rate; - uint32_t byte_rate; - uint16_t block_align; - uint16_t bits_per_sample = 16; - char data[4] = {'d', 'a', 't', 'a'}; - uint32_t data_size; -}; - -static bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { - std::ofstream file(fname, std::ios::binary); - if (!file) { - LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str()); - return false; - } - - wav_header header; - header.sample_rate = sample_rate; - header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); - 
header.block_align = header.num_channels * (header.bits_per_sample / 8); - header.data_size = data.size() * (header.bits_per_sample / 8); - header.chunk_size = 36 + header.data_size; - - file.write(reinterpret_cast(&header), sizeof(header)); - - for (const auto & sample : data) { - int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); - file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); - } - - return file.good(); -} - static void fill_hann_window(int length, bool periodic, float * output) { int offset = -1; if (periodic) { diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 62e1480bb5881..c3885c41c1fa1 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -6,6 +6,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_LLAMA_CSM, "llama-csm" }, { LLM_ARCH_LLAMA4, "llama4" }, { LLM_ARCH_DECI, "deci" }, { LLM_ARCH_FALCON, "falcon" }, @@ -217,27 +218,57 @@ static const std::map> LLM_TENSOR_N { LLM_ARCH_LLAMA, { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, 
"blk.%d.ffn_up_exps" }, + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, + { + LLM_ARCH_LLAMA_CSM, // like LLM_ARCH_LLAMA, but with extra tensors for Sesame CSM + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + { LLM_TENSOR_FFN_GATE_EXPS, 
"blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_CSM_AUDIO_EMBD, "audio_embd" }, + { LLM_TENSOR_CSM_CBOOK_OUTPUT, "codebook0_head" }, + { LLM_TENSOR_CSM_AUDIO_OUTPUT, "audio_head" }, + { LLM_TENSOR_CSM_PROJ, "csm_proj" }, }, }, { @@ -1676,6 +1707,10 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CSM_AUDIO_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_CSM_CBOOK_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CSM_AUDIO_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CSM_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index 98ca00a1bd0b0..cb6ebd50ff377 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -10,6 +10,7 @@ enum llm_arch { LLM_ARCH_LLAMA, + LLM_ARCH_LLAMA_CSM, LLM_ARCH_LLAMA4, LLM_ARCH_DECI, LLM_ARCH_FALCON, @@ -360,6 +361,10 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, + LLM_TENSOR_CSM_AUDIO_EMBD, + LLM_TENSOR_CSM_CBOOK_OUTPUT, + LLM_TENSOR_CSM_AUDIO_OUTPUT, + LLM_TENSOR_CSM_PROJ, }; enum llm_tensor_layer { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6b7bfecf3a1cf..cd549e986c2a9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -508,7 +508,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { + if (arch == LLM_ARCH_LLAMA + || arch == LLM_ARCH_LLAMA_CSM + || arch == LLM_ARCH_DECI + || arch == 
LLM_ARCH_FALCON + ) { if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } @@ -526,6 +530,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // arch-specific KVs switch (arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_LLAMA_CSM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -1738,6 +1743,48 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; + case LLM_ARCH_LLAMA_CSM: + { + // TODO: maybe store these in gguf metadata + int64_t csm_audio_cbook_size = 2051; // audio codebook size + int64_t csm_audio_tokens = 32; // equal to number of audio tokens for Mimi + //int64_t csm_n_audio_vocab = csm_audio_cbook_size*csm_acoustic_tokens; + + csm_output_cbook = create_tensor(tn(LLM_TENSOR_CSM_CBOOK_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size}, TENSOR_NOT_REQUIRED); + + bool is_backbone = csm_output_cbook != nullptr; + + csm_output_audio = is_backbone ? nullptr + : create_tensor(tn(LLM_TENSOR_CSM_AUDIO_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size, csm_audio_tokens+1}, 0); + + tok_embd = is_backbone + ? create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0) + : create_tensor(tn(LLM_TENSOR_CSM_AUDIO_EMBD, "weight"), {n_embd*2, n_vocab}, 0); + + csm_proj = is_backbone + ? 
create_tensor(tn(LLM_TENSOR_CSM_PROJ, "weight"), {n_embd, n_embd/2}, 0) + : create_tensor(tn(LLM_TENSOR_CSM_PROJ, "weight"), {n_embd*2, n_embd}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + // output tensor is either audio or code depends on backbone / decoder + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } break; case LLM_ARCH_LLAMA4: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -1765,6 +1812,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); @@ -4625,7 +4675,7 @@ struct llm_build_llama : public llm_graph_context { cb(cur, "result_norm", -1); res->t_embd = cur; - // lm_head + // lm_head (normal case) cur = build_lora_mm(model.output, cur); // For Granite architecture @@ -4640,6 +4690,192 @@ struct llm_build_llama : public llm_graph_context { } }; +// llama used by Sesame CSM +struct llm_build_llama_csm : public llm_graph_context { + llm_build_llama_csm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + bool is_backbone = model.csm_output_cbook; + bool is_decoder = !is_backbone; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // hacky way to get the audio embedding from user code (used in prompt processing) + // this will be triggered during warmup + if (is_decoder && n_tokens == 2) { + ggml_tensor * tmp = ggml_cast(ctx0, model.tok_embd, GGML_TYPE_F32); + cb(tmp, "audio_embd.weight", -1); + ggml_build_forward_expand(gf, tmp); + } + + ggml_tensor * input_embd = inpL; + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + if (is_decoder && inpL->ne[0] != hparams.n_embd) { + inpL = build_lora_mm(model.csm_proj, inpL); + } + + auto * inp_attn = build_attn_inp_kv_unified(); + + const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = 
build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + 
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + if (model.csm_output_cbook) { + // Sesame csm backbone + // hack: because n_cbook < n_vocab, we use the first logits for the output + int64_t n_vocab = model.tok_embd->ne[1]; + int64_t n_codes = model.csm_output_cbook->ne[1]; + ggml_tensor * last_h = cur; + cur = build_lora_mm(model.csm_output_cbook, cur); + cur = ggml_pad(ctx0, cur, n_vocab - n_codes, 0, 0, 0); + + // project to csm decoder dim + last_h = build_lora_mm(model.csm_proj, last_h); + cb(last_h, "output_csm_proj", -1); // use callback to retrieve the result + ggml_build_forward_expand(gf, last_h); + + } else if (model.csm_output_audio && ggml_nelements(cur)) { + // Sesame csm decoder + // hack: because n_audio < n_vocab, we use the first logits for the output + cur = build_lora_mm_id(model.csm_output_audio, cur, inp_pos); + int64_t n_vocab = model.tok_embd->ne[1]; + int64_t n_codes = cur->ne[0]; + cur = ggml_pad(ctx0, cur, n_vocab - n_codes, cur->ne[1], 0, 0); + + // also get audio embeddings, which will be passed back to backbone to keep track of generation progress + if (ubatch.token) { + cb(input_embd, "output_audio_embd", -1); + ggml_build_forward_expand(gf, input_embd); + } + + } else { + // otherwise, dummy output + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_deci : public llm_graph_context { llm_build_deci(const llama_model & model, const llm_graph_params 
& params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -12815,6 +13051,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_LLAMA_CSM: + { + llm = std::make_unique(*this, params, gf); + } break; case LLM_ARCH_DECI: { llm = std::make_unique(*this, params, gf); @@ -13170,6 +13410,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: + case LLM_ARCH_LLAMA_CSM: case LLM_ARCH_LLAMA4: case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN: diff --git a/src/llama-model.h b/src/llama-model.h index fd82d106ccda8..1527c1ea7705c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -342,6 +342,11 @@ struct llama_model { struct ggml_tensor * conv1d = nullptr; struct ggml_tensor * conv1d_b = nullptr; + // sesame csm + struct ggml_tensor * csm_output_cbook = nullptr; // backbone output codebook + struct ggml_tensor * csm_output_audio = nullptr; // audio decoder output + struct ggml_tensor * csm_proj = nullptr; // to convert backbone dim to decoder dim + std::vector layers; llama_model_params params;