@@ -214,23 +214,23 @@ jobs:
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
- python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- --use_sdpa_with_kv_cache \
- -X \
- --xnnpack-extended-ops \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "${OUT_ET_MODEL_NAME}.pte" \
- -kv \
- -d fp32 \
- --preq_embedding_quantize 8,0 \
- --use_spin_quant native \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ model.use_sdpa_with_kv_cache=true \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+ model.use_kv_cache=true \
+ model.dtype_override=fp32 \
+ base.preq_embedding_quantize=\'8,0\' \
+ quantization.use_spin_quant=native \
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
# QAT + LoRA
@@ -241,53 +241,55 @@ jobs:
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
- python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -qat \
- -lora 16 \
- --preq_mode 8da4w_output_8da8w \
- --preq_group_size 32 \
- --preq_embedding_quantize 8,0 \
- --use_sdpa_with_kv_cache \
- -kv \
- -X \
- --xnnpack-extended-ops \
- -d fp32 \
- --max_seq_length 2048 \
- --max_context_length 2048 \
- --output_name "${OUT_ET_MODEL_NAME}.pte" \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+ python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ quantization.use_qat=true \
+ base.use_lora=16 \
+ base.preq_mode="8da4w_output_8da8w" \
+ base.preq_group_size=32 \
+ base.preq_embedding_quantize=\'8,0\' \
+ model.use_sdpa_with_kv_cache=true \
+ model.use_kv_cache=true \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ model.dtype_override=fp32 \
+ export.max_seq_length=2048 \
+ export.max_context_length=2048 \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte" \
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
- python -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -kv \
- --use_sdpa_with_kv_cache \
- -X \
- -d bf16 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ python -m extension.llm.export.export_llm \
+ base.model_class="llama3_2" \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ backend.xnnpack.enabled=true \
+ model.dtype_override=bf16 \
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
- python -m examples.models.llama.export_llama \
- --model llama3_2 \
- --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
- --params "${DOWNLOADED_PATH}/params.json" \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w -G 32 -E 8,0 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ python -m extension.llm.export.export_llm \
+ base.model_class=llama3_2 \
+ base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+ base.params="${DOWNLOADED_PATH}/params.json" \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ model.dtype_override=fp32 \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ quantization.qmode=8da4w \
+ quantization.group_size=32 \
+ quantization.embedding_quantize=\'8,0\' \
+ base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
@@ -313,19 +315,19 @@ jobs:
elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
- python -m examples.models.llama.export_llama \
- --model qwen3-0_6b \
- --params examples/models/qwen3/0_6b_config.json \
- -kv \
- --use_sdpa_with_kv_cache \
- -d fp32 \
- -X \
- --xnnpack-extended-ops \
- -qmode 8da4w \
- -G 32 \
- -E 8,0 \
- --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
- --output_name="${OUT_ET_MODEL_NAME}.pte"
+ python -m extension.llm.export.export_llm \
+ base.model_class=qwen3_0_6b \
+ base.params=examples/models/qwen3/0_6b_config.json \
+ model.use_kv_cache=true \
+ model.use_sdpa_with_kv_cache=true \
+ model.dtype_override=fp32 \
+ backend.xnnpack.enabled=true \
+ backend.xnnpack.extended_ops=true \
+ quantization.qmode=8da4w \
+ quantization.group_size=32 \
+ quantization.embedding_quantize=\'8,0\' \
+ base.metadata='"{\"get_bos_id\": 151644,\"get_eos_ids\":[151645]}"' \
+ export.output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
fi
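
For reference, the migration pattern applied throughout this diff: the old examples.models.llama.export_llama argparse flags become Hydra-style key=value overrides on extension.llm.export.export_llm, grouped under base., model., backend., quantization., and export. Below is a minimal sketch, with the mapping read off the paired -/+ lines above; the checkpoint and params paths are placeholders, and it assumes it is run from an ExecuTorch checkout where the module is importable.

# Flag-to-override mapping (inferred from the -/+ pairs in this diff):
#   --model "llama3_2"        -> base.model_class="llama3_2"
#   --checkpoint / --params   -> base.checkpoint= / base.params=
#   -kv                       -> model.use_kv_cache=true
#   --use_sdpa_with_kv_cache  -> model.use_sdpa_with_kv_cache=true
#   -d fp32                   -> model.dtype_override=fp32
#   -X                        -> backend.xnnpack.enabled=true
#   --xnnpack-extended-ops    -> backend.xnnpack.extended_ops=true
#   -qmode / -G / -E          -> quantization.qmode / .group_size / .embedding_quantize
#   --output_name             -> export.output_name
# Placeholder paths below; substitute real checkpoint/params locations.
python -m extension.llm.export.export_llm \
  base.model_class="llama3_2" \
  base.checkpoint="path/to/consolidated.00.pth" \
  base.params="path/to/params.json" \
  model.use_kv_cache=true \
  model.dtype_override=fp32 \
  backend.xnnpack.enabled=true \
  export.output_name="model.pte"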