diff --git a/backends/xnnpack/operators/__init__.py b/backends/xnnpack/operators/__init__.py
index dcffa42ac3f..9d81b7f8e29 100644
--- a/backends/xnnpack/operators/__init__.py
+++ b/backends/xnnpack/operators/__init__.py
@@ -30,6 +30,7 @@
     op_minimum,
     op_multiply,
     op_negate,
+    op_permute,
     op_prelu,
     op_quantize_per_tensor,
     op_relu,
@@ -42,7 +43,6 @@
     op_squeeze,
     op_static_constant_pad,
     op_static_resize_bilinear_2d,
-    op_static_transpose,
     op_sub,
     op_to_copy,
 )
diff --git a/backends/xnnpack/operators/op_static_transpose.py b/backends/xnnpack/operators/op_permute.py
similarity index 97%
rename from backends/xnnpack/operators/op_static_transpose.py
rename to backends/xnnpack/operators/op_permute.py
index ce1cd43c1ad..0ca92a7a039 100644
--- a/backends/xnnpack/operators/op_static_transpose.py
+++ b/backends/xnnpack/operators/op_permute.py
@@ -20,7 +20,7 @@
 
 
 @register_node_visitor
-class StaticTransposeVisitor(NodeVisitor):
+class PermuteVisitor(NodeVisitor):
     target = "aten.permute_copy.default"
 
     def __init__(self, *args) -> None:
diff --git a/backends/xnnpack/operators/op_skip_ops.py b/backends/xnnpack/operators/op_skip_ops.py
index 83b6eee32b0..345b7896d34 100644
--- a/backends/xnnpack/operators/op_skip_ops.py
+++ b/backends/xnnpack/operators/op_skip_ops.py
@@ -113,12 +113,3 @@ class OpSymSizeInt(OpSkipOps):
     """
 
     target = "sym_size.int"
-
-
-@register_node_visitor
-class OpPermuteCopyDefault(OpSkipOps):
-    """
-    do nothing if node is permute_copy.default
-    """
-
-    target = "aten.permute_copy.default"
diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp
index 0c1c9e6d42c..8498bd84c5f 100644
--- a/backends/xnnpack/runtime/XNNCompiler.cpp
+++ b/backends/xnnpack/runtime/XNNCompiler.cpp
@@ -1517,6 +1517,7 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
   if (!executor->qinputs_.empty() && flatbuffer_graph->xnodes()->size() > 0 &&
       flatbuffer_graph->xnodes()->Get(0)->xnode_union_type() ==
           fb_xnnpack::XNodeUnion::XNNFullyConnected) {
+#ifdef ENABLE_DYNAMIC_QUANTIZATION
     // This delegate is for DQLinear which supports dynamic input shapes
     if (executor->getNumInputs() < 1 || executor->getNumOutputs() != 1) {
       ET_LOG(
@@ -1525,6 +1526,10 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
       return Error::NotSupported;
     }
     executor->setNeedsResizeOutput();
+#else
+    ET_LOG(Error, "DQ Linear is not supported");
+    return Error::NotSupported;
+#endif
   }
 
   return err;
diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp
index 5e39c86c1ba..30b60ee329d 100644
--- a/backends/xnnpack/runtime/XNNExecutor.cpp
+++ b/backends/xnnpack/runtime/XNNExecutor.cpp
@@ -7,7 +7,9 @@
  */
 
 #include <executorch/backends/xnnpack/runtime/XNNExecutor.h>
+#ifdef ENABLE_DYNAMIC_QUANTIZATION
 #include <executorch/backends/xnnpack/runtime/utils/utils.h>
+#endif
 
 namespace torch {
 namespace executor {
@@ -17,6 +19,7 @@ namespace delegate {
 Error XNNExecutor::set_external_input(uint32_t id, Tensor* input) {
   auto qinput_pair = qinputs_.find(id);
   if (qinput_pair != qinputs_.end()) {
+#ifdef ENABLE_DYNAMIC_QUANTIZATION
     auto qinput = qinput_pair->second;
     // dq the input and copy it in to qinput
     float input_min, input_max;
@@ -60,6 +63,10 @@ Error XNNExecutor::set_external_input(uint32_t id, Tensor* input) {
         {static_cast<float>(input_qparam.scale),
          static_cast<int8_t>(input_qparam.zero_point)},
         batch_size});
+#else
+    ET_LOG(Error, "Dynamic Quantization is not supported");
+    return Error::NotSupported;
+#endif
   } else {
     externals_.emplace_back(xnn_external_value{id, input->mutable_data_ptr()});
   }
diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl
index 6df70f654aa..0f80af77cd0 100644
--- a/backends/xnnpack/targets.bzl
+++ b/backends/xnnpack/targets.bzl
@@ -65,6 +65,7 @@ def define_common_targets():
             "//executorch/extension/pybindings/...",
             "@EXECUTORCH_CLIENTS",
         ],
+        preprocessor_flags = [] if runtime.is_oss else ["-DENABLE_DYNAMIC_QUANTIZATION"],
         deps = [
             third_party_dep("XNNPACK"),
             ":xnnpack_schema",
diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS
index 6ccc8d4c345..3305f259931 100644
--- a/backends/xnnpack/test/TARGETS
+++ b/backends/xnnpack/test/TARGETS
@@ -123,3 +123,16 @@ python_unittest(
         "//executorch/backends/xnnpack/test/tester:tester",
     ],
 )
+
+# Model-level XNNPACK tests.
+# Every Python file under models/ is picked up as a test source.
+python_unittest(
+    name = "test_xnnpack_models",
+    srcs = glob(["models/*.py"]),
+    deps = [
+        "//caffe2:torch",
+        "//executorch/backends/xnnpack/partition:xnnpack_partitioner",
+        "//executorch/backends/xnnpack/test/tester:tester",
+        "//pytorch/vision:torchvision",
+    ],
+)
diff --git a/backends/xnnpack/test/models/mobilenet_v2.py b/backends/xnnpack/test/models/mobilenet_v2.py
new file mode 100644
index 00000000000..5f7c72dab9e
--- /dev/null
+++ b/backends/xnnpack/test/models/mobilenet_v2.py
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+import torchvision.models as models
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
+    XnnpackQuantizedPartitioner2,
+)
+from executorch.backends.xnnpack.test.tester import Partition, Tester
+from executorch.backends.xnnpack.test.tester.tester import Export
+from executorch.backends.xnnpack.utils.configs import get_xnnpack_capture_config
+from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
+
+
+class TestMobileNetV2(unittest.TestCase):
+    export_stage = Export(get_xnnpack_capture_config(enable_aot=True))
+
+    mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights)
+    mv2 = mv2.eval()
+    model_inputs = (torch.ones(1, 3, 224, 244),)
+
+    all_operators = {
+        "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default",
+        "executorch_exir_dialects_edge__ops_aten_add_Tensor",
+        "executorch_exir_dialects_edge__ops_aten_permute_copy_default",
+        "executorch_exir_dialects_edge__ops_aten_addmm_default",
+        "executorch_exir_dialects_edge__ops_aten_mean_dim",
+        "executorch_exir_dialects_edge__ops_aten_hardtanh_default",
+        "executorch_exir_dialects_edge__ops_aten_convolution_default",
+    }
+
+    def test_fp32(self):
+
+        (
+            Tester(self.mv2, self.model_inputs)
+            .export(self.export_stage)
+            .to_edge()
+            .check(list(self.all_operators))
+            .partition()
+            .check(["torch.ops.executorch_call_delegate"])
+            .check_not(list(self.all_operators))
+            .to_executorch()
+            .serialize()
+            .run_method()
+            .compare_outputs()
+        )
+
+    def test_qs8_pt2e(self):
+        # Quantization fuses away batchnorm, so it is no longer in the graph
+        ops_after_quantization = self.all_operators - {
+            "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default",
+        }
+
+        (
+            Tester(self.mv2, self.model_inputs)
+            .quantize2()
+            .export(self.export_stage)
+            .to_edge()
+            .check(list(ops_after_quantization))
+            .partition(Partition(partitioner=XnnpackQuantizedPartitioner2))
+            .check(["torch.ops.executorch_call_delegate"])
+            .check_not(list(ops_after_quantization))
+            .to_executorch()
+            .serialize()
+            .run_method()
+            .compare_outputs()
+        )
diff --git a/backends/xnnpack/test/ops/add.py b/backends/xnnpack/test/ops/add.py
index ee19be67cdd..fe7686d1f99 100644
--- a/backends/xnnpack/test/ops/add.py
+++ b/backends/xnnpack/test/ops/add.py
@@ -75,9 +75,9 @@ def test_add_quantized_pt2e(self):
 
         (
             Tester(add_module, model_inputs)
+            .quantize2()
             .export()
             .check_count({"torch.ops.aten.add.Tensor": 4})
-            .quantize2()
             .check(["torch.ops.quantized_decomposed"])
             .to_edge()
             .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 4})
diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py
index 23602bde8bf..a736284bd9d 100644
--- a/backends/xnnpack/test/tester/tester.py
+++ b/backends/xnnpack/test/tester/tester.py
@@ -10,6 +10,7 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 import torch
+import torch._export as export
 from executorch import exir
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
     XnnpackFloatingPointPartitioner,
@@ -145,23 +146,23 @@ def __init__(
 
         self.quantizer.set_global(self.quantization_config)
 
-        self.converted_program = None
+        self.converted_graph = None
 
     def run(
-        self, artifact: ExirExportedProgram, inputs: Optional[Tuple[torch.Tensor]]
+        self, artifact: torch.nn.Module, inputs: Optional[Tuple[torch.Tensor]]
     ) -> None:
-        prepared = prepare_pt2e(artifact.exported_program.graph_module, self.quantizer)
+        captured_graph = export.capture_pre_autograd_graph(artifact, inputs)
+        prepared = prepare_pt2e(captured_graph, self.quantizer)
         converted = convert_pt2e(prepared)
-        artifact.exported_program._graph_module = converted
-        self.converted_program = artifact
+        self.converted_graph = converted
 
     @property
-    def artifact(self) -> ExirExportedProgram:
-        return self.converted_program
+    def artifact(self) -> torch.fx.GraphModule:
+        return self.converted_graph
 
     @property
     def graph_module(self) -> str:
-        return self.converted_program.exported_program.graph_module
+        return self.converted_graph
 
 
 @register_stage
@@ -274,12 +275,11 @@ def __init__(
         self.inputs = inputs
         self.stages: Dict[str, Stage] = OrderedDict.fromkeys(list(_stages_.keys()))
         self.pipeline = {
+            self._stage_name(Quantize2): [self._stage_name(Export)],
             self._stage_name(Quantize): [self._stage_name(Export)],
             self._stage_name(Export): [
-                self._stage_name(Quantize2),
                 self._stage_name(ToEdge),
             ],
-            self._stage_name(Quantize2): [self._stage_name(ToEdge)],
             self._stage_name(ToEdge): [self._stage_name(Partition)],
             # TODO Make this Stage optional
             self._stage_name(Partition): [self._stage_name(ToExecutorch)],
diff --git a/backends/xnnpack/utils/configs.py b/backends/xnnpack/utils/configs.py
index b6114f939d9..653e11a9746 100644
--- a/backends/xnnpack/utils/configs.py
+++ b/backends/xnnpack/utils/configs.py
@@ -36,4 +36,8 @@ def get_xnnpack_capture_config(dynamic_shape=False, enable_aot: Optional[bool] =
     if enable_aot is None:
         return CaptureConfig(enable_dynamic_shape=dynamic_shape)
     else:
-        return CaptureConfig(enable_dynamic_shape=dynamic_shape, enable_aot=enable_aot)
+        return CaptureConfig(
+            enable_dynamic_shape=dynamic_shape,
+            enable_aot=enable_aot,
+            _unlift=enable_aot,
+        )