
Commit f072e64

Use export_llm in CI (#11836)
Update CI to use the new `export_llm` instead of the old `export_llama`.
Parent: 2ed84c5
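The change swaps `export_llama`'s argparse-style flags for dot-notation `key=value` overrides grouped under `base.*`, `model.*`, `export.*`, `quantization.*`, `backend.*`, and `debug.*`. As a quick orientation, here is a minimal before/after sketch assembled from the flag-to-key mappings in the diffs below; the checkpoint, params, and output paths are placeholders, not files from this commit:

```bash
# Before: argparse flags consumed by export_llama.
python -m examples.models.llama.export_llama \
  -c model.pth -p params.json -d fp32 -n model.pte -kv \
  -X --xnnpack-extended-ops -qmode 8da4w -G 128

# After: namespaced key=value overrides consumed by export_llm.
python -m extension.llm.export.export_llm \
  base.checkpoint=model.pth \
  base.params=params.json \
  model.dtype_override=fp32 \
  export.output_name=model.pte \
  model.use_kv_cache=true \
  backend.xnnpack.enabled=true \
  backend.xnnpack.extended_ops=true \
  quantization.qmode=8da4w \
  quantization.group_size=128
```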

File tree: 11 files changed (+210, -204 lines)


.ci/scripts/test_llama.sh

Lines changed: 11 additions & 14 deletions
@@ -54,10 +54,7 @@ PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"
 # Default CMake Build Type to release mode
 CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
 
-if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
-  echo "Expecting atleast 4 positional arguments"
-  echo "Usage: [...]"
-fi
+# Argument validation is done individually below for each required parameter
 if [[ -z "${MODEL_NAME:-}" ]]; then
   echo "Missing model name, exiting..."
   exit 1
@@ -224,34 +221,34 @@ fi
 # Export model.
 EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
 echo "Exporting ${EXPORTED_MODEL_NAME}"
-EXPORT_ARGS="-c ${CHECKPOINT_FILE_NAME} -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
+EXPORT_ARGS="base.checkpoint=${CHECKPOINT_FILE_NAME} base.params=${PARAMS} model.dtype_override=${DTYPE} export.output_name=${EXPORTED_MODEL_NAME} model.use_kv_cache=true"
 if [[ "${XNNPACK}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -X --xnnpack-extended-ops -qmode 8da4w -G 128"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.xnnpack.enabled=true backend.xnnpack.extended_ops=true quantization.qmode=8da4w quantization.group_size=128"
 fi
 if [[ "${CUSTOM}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
+  EXPORT_ARGS="${EXPORT_ARGS} model.use_sdpa_with_kv_cache=true"
 fi
 if [[ "${QE}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
+  EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=\"8,1024\""
 fi
 if [[ "${MPS}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv -v --mps --disable_dynamic_shape"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.mps.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
 fi
 if [[ "${COREML}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv -v --coreml --disable_dynamic_shape"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.coreml.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
 fi
 if [[ "${QNN}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+  EXPORT_ARGS="${EXPORT_ARGS} backend.qnn.enabled=true model.enable_dynamic_shape=false debug.verbose=true"
   echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
   if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
-    EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+    EXPORT_ARGS+=" base.tokenizer_path=tokenizer.model quantization.pt2e_quantize=qnn_16a16w quantization.calibration_tasks=[\"wikitext\"] quantization.calibration_limit=1 quantization.calibration_seq_length=128 quantization.calibration_data=\"Once\""
   fi
 fi
 if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} --quantize_kv_cache"
+  EXPORT_ARGS="${EXPORT_ARGS} model.quantize_kv_cache=true"
 fi
 # Add dynamically linked library location
-$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm ${EXPORT_ARGS}
 
 # Create tokenizer.bin.
 echo "Creating tokenizer.bin"

.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 10 additions & 10 deletions
@@ -70,16 +70,16 @@ QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
 QEMBEDDING_BITWIDTH=4 # Can be 1-8
 QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16
 
-${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
-  --checkpoint "${LLAMA_CHECKPOINT:?}" \
-  --params "${LLAMA_PARAMS:?}" \
-  -kv \
-  --use_sdpa_with_kv_cache \
-  --output_name=${MODEL_OUT} \
-  -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
-  --group_size ${QLINEAR_GROUP_SIZE} \
-  -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
-  -d fp32
+${PYTHON_EXECUTABLE} -m extension.llm.export.export_llm \
+  base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+  base.params="${LLAMA_PARAMS:?}" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  export.output_name="${MODEL_OUT}" \
+  quantization.qmode="torchao:8da${QLINEAR_BITWIDTH}w" \
+  quantization.group_size=${QLINEAR_GROUP_SIZE} \
+  quantization.embedding_quantize=\"torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\" \
+  model.dtype_override=fp32
 
 # Test run
 ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time,"
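One subtlety in the rewritten invocation: the backslash-escaped quotes in `quantization.embedding_quantize=\"torchao:...\"` are consumed by bash, so the override parser receives a value wrapped in literal double quotes rather than a bare string containing a comma. The shell side can be checked directly; the expansion shown here is exact, while what export_llm then does with the quoted value is an assumption about its parser:

```bash
# Print the argument exactly as bash hands it to export_llm.
QEMBEDDING_BITWIDTH=4
QEMBEDDING_GROUP_SIZE=32
printf '%s\n' quantization.embedding_quantize=\"torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}\"
# Output: quantization.embedding_quantize="torchao:4,32"
```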

.ci/scripts/test_model.sh

Lines changed: 6 additions & 6 deletions
@@ -86,8 +86,8 @@ test_model() {
   if [[ "${MODEL_NAME}" == "llama2" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama.export_llama
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json
+    # Test export_llm script: python3 -m extension.llm.export.export_llm
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.checkpoint=examples/models/llama/params/demo_rand_params.pth base.params=examples/models/llama/params/demo_config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
   fi
@@ -100,17 +100,17 @@ test_model() {
   if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama.export_llama.
+    # Test export_llm script: python3 -m extension.llm.export.export_llm.
     # Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/qwen2_5/1_5b_config.json
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json
     rm "./${MODEL_NAME}.pte"
     return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears.
   fi
   if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama.export_llama.
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/phi_4_mini/config.json
+    # Test export_llm script: python3 -m extension.llm.export.export_llm.
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
     return
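All three model cases reduce to one pattern: `base.model_class` selects the architecture and `base.params` its config JSON, while `base.checkpoint` is optional (the qwen2_5 and phi_4_mini cases appear to export from a random initialization, per the comment above). A condensed sketch of that pattern; the loop is illustrative, the real script keeps each model in its own if-block:

```bash
# Illustrative loop over the checkpoint-less exports above.
for spec in \
  "qwen2_5 examples/models/qwen2_5/1_5b_config.json" \
  "phi_4_mini examples/models/phi_4_mini/config.json"; do
  read -r model params <<< "${spec}"
  "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm \
    base.model_class="${model}" \
    base.params="${params}"
done
```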

.github/workflows/android-perf.yml

Lines changed: 72 additions & 70 deletions
@@ -214,23 +214,23 @@ jobs:
   --files "tokenizer.model" "params.json" "consolidated.00.pth"
 )
 # Export using ExecuTorch's model definition
-python -m examples.models.llama.export_llama \
-  --model "llama3_2" \
-  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-  --params "${DOWNLOADED_PATH}/params.json" \
-  --use_sdpa_with_kv_cache \
-  -X \
-  --xnnpack-extended-ops \
-  --preq_mode 8da4w_output_8da8w \
-  --preq_group_size 32 \
-  --max_seq_length 2048 \
-  --max_context_length 2048 \
-  --output_name "${OUT_ET_MODEL_NAME}.pte" \
-  -kv \
-  -d fp32 \
-  --preq_embedding_quantize 8,0 \
-  --use_spin_quant native \
-  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+python -m extension.llm.export.export_llm \
+  base.model_class="llama3_2" \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  model.use_sdpa_with_kv_cache=true \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  base.preq_mode="8da4w_output_8da8w" \
+  base.preq_group_size=32 \
+  export.max_seq_length=2048 \
+  export.max_context_length=2048 \
+  export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+  model.use_kv_cache=true \
+  model.dtype_override=fp32 \
+  base.preq_embedding_quantize=\'8,0\' \
+  quantization.use_spin_quant=native \
+  base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
 ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
   # QAT + LoRA
@@ -241,53 +241,55 @@ jobs:
   --files "tokenizer.model" "params.json" "consolidated.00.pth"
 )
 # Export using ExecuTorch's model definition
-python -m examples.models.llama.export_llama \
-  --model "llama3_2" \
-  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-  --params "${DOWNLOADED_PATH}/params.json" \
-  -qat \
-  -lora 16 \
-  --preq_mode 8da4w_output_8da8w \
-  --preq_group_size 32 \
-  --preq_embedding_quantize 8,0 \
-  --use_sdpa_with_kv_cache \
-  -kv \
-  -X \
-  --xnnpack-extended-ops \
-  -d fp32 \
-  --max_seq_length 2048 \
-  --max_context_length 2048 \
-  --output_name "${OUT_ET_MODEL_NAME}.pte" \
-  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+python -m extension.llm.export.export_llm \
+  base.model_class="llama3_2" \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  quantization.use_qat=true \
+  base.use_lora=16 \
+  base.preq_mode="8da4w_output_8da8w" \
+  base.preq_group_size=32 \
+  base.preq_embedding_quantize=\'8,0\' \
+  model.use_sdpa_with_kv_cache=true \
+  model.use_kv_cache=true \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  model.dtype_override=fp32 \
+  export.max_seq_length=2048 \
+  export.max_context_length=2048 \
+  export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+  base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
 ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
   # Original BF16 version, without any quantization
   DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-  python -m examples.models.llama.export_llama \
-    --model "llama3_2" \
-    --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-    --params "${DOWNLOADED_PATH}/params.json" \
-    -kv \
-    --use_sdpa_with_kv_cache \
-    -X \
-    -d bf16 \
-    --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-    --output_name="${OUT_ET_MODEL_NAME}.pte"
+  python -m extension.llm.export.export_llm \
+    base.model_class="llama3_2" \
+    base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+    base.params="${DOWNLOADED_PATH}/params.json" \
+    model.use_kv_cache=true \
+    model.use_sdpa_with_kv_cache=true \
+    backend.xnnpack.enabled=true \
+    model.dtype_override=bf16 \
+    base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+    export.output_name="${OUT_ET_MODEL_NAME}.pte"
   ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
   DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-  python -m examples.models.llama.export_llama \
-    --model llama3_2 \
-    --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-    --params "${DOWNLOADED_PATH}/params.json" \
-    -kv \
-    --use_sdpa_with_kv_cache \
-    -d fp32 \
-    -X \
-    --xnnpack-extended-ops \
-    -qmode 8da4w -G 32 -E 8,0 \
-    --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-    --output_name="${OUT_ET_MODEL_NAME}.pte"
+  python -m extension.llm.export.export_llm \
+    base.model_class=llama3_2 \
+    base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+    base.params="${DOWNLOADED_PATH}/params.json" \
+    model.use_kv_cache=true \
+    model.use_sdpa_with_kv_cache=true \
+    model.dtype_override=fp32 \
+    backend.xnnpack.enabled=true \
+    backend.xnnpack.extended_ops=true \
+    quantization.qmode=8da4w \
+    quantization.group_size=32 \
+    quantization.embedding_quantize=\'8,0\' \
+    base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+    export.output_name="${OUT_ET_MODEL_NAME}.pte"
  ls -lh "${OUT_ET_MODEL_NAME}.pte"
 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
   export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
@@ -313,19 +315,19 @@ jobs:
 elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
   if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
     DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
-    python -m examples.models.llama.export_llama \
-      --model qwen3-0_6b \
-      --params examples/models/qwen3/0_6b_config.json \
-      -kv \
-      --use_sdpa_with_kv_cache \
-      -d fp32 \
-      -X \
-      --xnnpack-extended-ops \
-      -qmode 8da4w \
-      -G 32 \
-      -E 8,0 \
-      --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
-      --output_name="${OUT_ET_MODEL_NAME}.pte"
+    python -m extension.llm.export.export_llm \
+      base.model_class=qwen3_0_6b \
+      base.params=examples/models/qwen3/0_6b_config.json \
+      model.use_kv_cache=true \
+      model.use_sdpa_with_kv_cache=true \
+      model.dtype_override=fp32 \
+      backend.xnnpack.enabled=true \
+      backend.xnnpack.extended_ops=true \
+      quantization.qmode=8da4w \
+      quantization.group_size=32 \
+      quantization.embedding_quantize=\'8,0\' \
+      base.metadata='"{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}"' \
+      export.output_name="${OUT_ET_MODEL_NAME}.pte"
     ls -lh "${OUT_ET_MODEL_NAME}.pte"
   fi
 fi
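The thorniest rewrite in this workflow is `base.metadata`: the value has to survive two parsing layers. Bash strips the single quotes and passes everything between them through verbatim, so the override parser receives a double-quoted JSON string whose inner quotes are backslash-escaped. The shell side can be verified directly; the expansion shown here is exact, while the parser-side handling is an assumption about export_llm:

```bash
# Show the single argument that reaches export_llm after bash parsing.
printf '%s\n' base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
# Output: base.metadata="{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"
```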
