import jax
import jax.numpy as jnp
+ import numpy as np
import torch
from absl.testing import absltest
+ from flax import nnx
from huggingface_hub import snapshot_download
from jax.sharding import AxisType
from jax.sharding import PartitionSpec as P
class TestModuleForwardPasses(absltest.TestCase):
    def setUp(self):
        super().setUp()
+         jax.config.update("jax_default_matmul_precision", "float32")
        model_name: str = "Qwen/Qwen3-0.6B"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

@@ -27,7 +30,13 @@ def setUp(self):
        model_ckpt_path = snapshot_download("Qwen/Qwen3-0.6B")
        self.mesh = jax.make_mesh(((1, 1)), ("fsdp", "tp"), axis_types=(AxisType.Explicit, AxisType.Explicit))
        jax.set_mesh(self.mesh)
-         self.nnx_model = params.create_model_from_safe_tensors(model_ckpt_path, self.bonsai_config, self.mesh)
+
+         # Cast the JAX model to float32 for precision matching with the PyTorch CPU model
+         graph_def, state = nnx.split(
+             params.create_model_from_safe_tensors(model_ckpt_path, self.bonsai_config, self.mesh)
+         )
+         state = jax.tree.map(lambda x: x.astype(jnp.float32) if isinstance(x, jax.Array) else x, state)
+         self.nnx_model = nnx.merge(graph_def, state)

        self.batch_size = 32
        self.num_input_tokens = 5
@@ -39,7 +48,11 @@ def _check_batched_logits(self, left_pads: int, torch_logits: torch.Tensor, nnx_
        max_len = torch_logits.shape[-2]
        for lp, tl, nl in zip(left_pads, torch_logits, nnx_logits):
            torch.testing.assert_close(
-                 torch.tensor(nl)[lp:max_len, :], tl[lp:, :], rtol=self.relaxed_tol, atol=self.relaxed_tol
+                 torch.tensor(np.array(nl, dtype=np.float32))[lp:max_len, :],
+                 tl[lp:, :],
+                 rtol=self.relaxed_tol,
+                 atol=self.relaxed_tol,
+                 check_dtype=False,
            )

    def _setup_torch_attn(self, input_embeddings: torch.Tensor, attention_mask: None = None):
@@ -122,147 +135,231 @@ def test_embedder(self):
        jx = jnp.array(tx.cpu().detach().numpy())

        jy, ty = nm.embedding.value.at[(jx,)].get(), tm(tx)
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_decoder_layer(self):
        nm = self.nnx_model.layers[0]
        tm = self.torch_model.model.layers[0].to(torch.float32)

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.emb_dim)
        jx = jax.random.normal(jax.random.key(0), shape=shape)
-         tx = torch.tensor(jx)
+         tx = torch.tensor(np.array(jx, dtype=np.float32))
        nnx_cache = self._init_nnx_cache(self.batch_size)
        torch_inputs = self._setup_torch_attn(tx)

        jy, ty = nm(jx, nnx_cache[0], jnp.ones((self.batch_size, self.num_input_tokens))), tm(**torch_inputs)
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_all_decoder_layers(self):
        nnx_cache = self._init_nnx_cache(self.batch_size)
        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.emb_dim)

        for nm, tm, nc in zip(self.nnx_model.layers, self.torch_model.model.layers, nnx_cache):
            jx = jax.random.normal(jax.random.key(0), shape=shape)
-             tx = torch.tensor(jx)
+             tx = torch.tensor(np.array(jx, dtype=np.float32))

            jy = nm(jx, nc, jnp.ones((self.batch_size, self.num_input_tokens)))
            torch_inputs = self._setup_torch_attn(tx)
            ty = tm.to(torch.float32)(**torch_inputs)
-             torch.testing.assert_close(torch.tensor(jy), ty, atol=self.relaxed_tol, rtol=self.relaxed_tol)
+             torch.testing.assert_close(
+                 torch.tensor(np.array(jy, dtype=np.float32)),
+                 ty,
+                 atol=self.relaxed_tol,
+                 rtol=self.relaxed_tol,
+                 check_dtype=False,
+             )

    def test_rms_norm(self):
        nm = self.nnx_model.layers[0].input_layernorm
        tm = self.torch_model.model.layers[0].input_layernorm

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.emb_dim)
-         jx = jax.random.normal(jax.random.key(0), shape=shape, dtype=jnp.bfloat16)
-         tx = torch.tensor(jx)
+         jx = jax.random.normal(jax.random.key(0), shape=shape)
+         tx = torch.tensor(np.array(jx, dtype=np.float32))

        jy, ty = nm(jx), tm(tx)
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_self_attn(self):
        nm = self.nnx_model.layers[0].attn
        tm = self.torch_model.model.layers[0].self_attn.to(torch.float32)

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.emb_dim)
        jx = jax.random.normal(jax.random.key(0), shape=shape)
-         tx = torch.tensor(jx)
+         tx = torch.tensor(np.array(jx, dtype=np.float32))
        torch_inputs = self._setup_torch_attn(tx)
        nnx_cache = self._init_nnx_cache(self.batch_size)

        jy = nm(jx, nnx_cache[0], jnp.ones((self.batch_size, self.num_input_tokens), dtype=jnp.float32))
        ty = tm(**torch_inputs)[0]
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_q_norm(self):
        nm = self.nnx_model.layers[0].attn.q_norm
        tm = self.torch_model.model.layers[0].self_attn.q_norm

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.num_heads, self.bonsai_config.head_dim)
-         jx = jax.random.normal(jax.random.key(0), shape=shape, dtype=jnp.bfloat16)
-         tx = torch.tensor(jx)
+         jx = jax.random.normal(jax.random.key(0), shape=shape)
+         tx = torch.tensor(np.array(jx, dtype=np.float32))

        jy, ty = nm(jx), tm(tx)
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_k_norm(self):
        nm = self.nnx_model.layers[0].attn.k_norm
        tm = self.torch_model.model.layers[0].self_attn.k_norm

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.num_kv_heads, self.bonsai_config.head_dim)
-         jx = jax.random.normal(jax.random.key(0), shape=shape, dtype=jnp.bfloat16)
-         tx = torch.tensor(jx)
+         jx = jax.random.normal(jax.random.key(0), shape=shape)
+         tx = torch.tensor(np.array(jx, dtype=np.float32))

        jy, ty = nm(jx), tm(tx)
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_q_proj(self):
        nm = self.nnx_model.layers[0].attn.q_proj
-         tm = self.torch_model.model.layers[0].self_attn.q_proj
+         tm = self.torch_model.model.layers[0].self_attn.q_proj.to(torch.float32)

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.emb_dim)
-         jx = jax.random.normal(jax.random.key(0), shape=shape, dtype=jnp.bfloat16)
-         tx = torch.tensor(jx)
+         jx = jax.random.normal(jax.random.key(0), shape=shape)
+         tx = torch.tensor(np.array(jx, dtype=np.float32))

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.num_heads, self.bonsai_config.head_dim)
        jy, ty = nm(jx), tm(tx).reshape(shape)
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_k_proj(self):
        nm = self.nnx_model.layers[0].attn.k_proj
-         tm = self.torch_model.model.layers[0].self_attn.k_proj
+         tm = self.torch_model.model.layers[0].self_attn.k_proj.to(torch.float32)

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.emb_dim)
-         jx = jax.random.normal(jax.random.key(0), shape=shape, dtype=jnp.bfloat16)
-         tx = torch.tensor(jx)
+         jx = jax.random.normal(jax.random.key(0), shape=shape)
+         tx = torch.tensor(np.array(jx, dtype=np.float32))

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.num_kv_heads, self.bonsai_config.head_dim)
        jy, ty = nm(jx), tm(tx).reshape(shape)
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_o_proj(self):
        nm = self.nnx_model.layers[0].attn.o_proj
-         tm = self.torch_model.model.layers[0].self_attn.o_proj
+         tm = self.torch_model.model.layers[0].self_attn.o_proj.to(torch.float32)

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.num_heads, self.bonsai_config.head_dim)
-         jx = jax.random.normal(jax.random.key(0), shape=shape, dtype=jnp.bfloat16)
-         tx = torch.tensor(jx).reshape(self.batch_size, self.num_input_tokens, -1)
+         jx = jax.random.normal(jax.random.key(0), shape=shape)
+         tx = torch.tensor(np.array(jx, dtype=np.float32)).reshape(self.batch_size, self.num_input_tokens, -1)

        jy, ty = nm(jx), tm(tx)
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_mlp(self):
        nm = self.nnx_model.layers[0].mlp
        tm = self.torch_model.model.layers[0].mlp.to(torch.float32)

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.emb_dim)
        jx = jax.random.normal(jax.random.key(0), shape=shape)
-         tx = torch.tensor(jx)
+         tx = torch.tensor(np.array(jx, dtype=np.float32))

        jy, ty = nm(jx), tm(tx)
-         torch.testing.assert_close(torch.tensor(jy), ty, rtol=self.relaxed_tol, atol=self.relaxed_tol)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_lm_head(self):
        nm = self.nnx_model.lm_head
        tm = self.torch_model.lm_head.to(torch.float32)

        shape = (self.batch_size, self.num_input_tokens, self.bonsai_config.emb_dim)
        jx = jax.random.normal(jax.random.key(0), shape=shape)
-         tx = torch.tensor(jx)
+         tx = torch.tensor(np.array(jx, dtype=np.float32))

        jy, ty = nm(jx), tm(tx)
-         torch.testing.assert_close(torch.tensor(jy), ty)
+         torch.testing.assert_close(
+             torch.tensor(np.array(jy, dtype=np.float32)),
+             ty,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_sin_cos(self):
        batch_size, seq_len, dim = 2, 10, 128
        hidden_states = torch.ones((batch_size, seq_len, dim))
        jp = jnp.stack([jnp.arange(seq_len), jnp.arange(seq_len)])
        js, jc = modeling._generate_pos_embeddings(jp, dim)
-         tc, ts = self.torch_model.model.rotary_emb(hidden_states, torch.tensor(jp))
+         tc, ts = self.torch_model.model.rotary_emb(hidden_states, torch.tensor(np.array(jp, dtype=np.float32)))
        tc, ts = tc[:, :, : dim // 2], ts[:, :, : dim // 2]
-         torch.testing.assert_close(torch.tensor(js), ts)
-         torch.testing.assert_close(torch.tensor(jc), tc)
+         torch.testing.assert_close(
+             torch.tensor(np.array(js, dtype=np.float32)),
+             ts,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )
+         torch.testing.assert_close(
+             torch.tensor(np.array(jc, dtype=np.float32)),
+             tc,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
+         )

    def test_full(self):
        query = ["Why is the sky blue instead of any other color like purple?"]
@@ -275,7 +372,11 @@ def test_full(self):
        torch_inputs = self._process_hf_tokens(query)
        torch_logits = self.torch_model(**torch_inputs).logits
        torch.testing.assert_close(
-             torch.tensor(nnx_logits)[:, :token_len, :], torch_logits, rtol=self.relaxed_tol, atol=self.relaxed_tol
+             torch.tensor(np.array(nnx_logits, dtype=np.float32))[:, :token_len, :],
+             torch_logits,
+             rtol=self.relaxed_tol,
+             atol=self.relaxed_tol,
+             check_dtype=False,
        )

    def test_full_batched(self):