Skip to content

Commit 85db7d5

Browse files
authored
[ML] Add EuroBERT/Jina v5 ops to graph validation allowlist (#3015)
Jina Embeddings v5 is based on EuroBERT, which uses a different architecture from the BERT family:
- RoPE (rotary position embeddings) → aten::sin, aten::cos
- RMSNorm (instead of LayerNorm) → aten::rsqrt
- SiLU activation (instead of GELU) → aten::silu

Required for Eland PR elastic/eland#818, which adds support for importing Jina v5 models into Elasticsearch.

aten::sin and aten::cos are now in the allowlist (needed by EuroBERT/Jina v5 for rotary position embeddings), so tests that used them as example "unrecognised" ops now fail:
- Replace torch.sin with torch.logit in synthetic test modules
- Update malicious model tests to check for ops that remain unrecognised (aten::tan, aten::exp) rather than sin/cos
- Regenerate malicious_hidden_in_submodule.pt with aten::logit+clamp so graph validation still fails when aten::sin is allowed for EuroBERT/Jina
- Update dev-tools/generate_malicious_models.py and test comments

Add jinaai/jina-embeddings-v5-text-nano to reference_models.json, validation_models.json, and the golden reference_model_ops.json with its 36 traced ops (verified all covered by the allowlist).

Pass trust_remote_code=True in torchscript_utils.py so models with custom code (like Jina v5 / EuroBERT) can be loaded by the extraction and validation tooling.

Made-with: Cursor
1 parent d4a4544 commit 85db7d5

File tree

8 files changed

+86
-24
lines changed

8 files changed

+86
-24
lines changed

bin/pytorch_inference/CSupportedOperations.cc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ const CSupportedOperations::TStringViewSet CSupportedOperations::FORBIDDEN_OPERA
4141
// deepset/tinyroberta-squad2, typeform/squeezebert-mnli,
4242
// facebook/bart-large-mnli, valhalla/distilbart-mnli-12-6,
4343
// distilbert-base-uncased-finetuned-sst-2-english,
44-
// sentence-transformers/all-distilroberta-v1.
44+
// sentence-transformers/all-distilroberta-v1,
45+
// jinaai/jina-embeddings-v5-text-nano (EuroBERT + LoRA).
4546
// Eland-deployed variants of the above models (with pooling/normalization layers).
4647
// Additional ops from Elasticsearch integration test models
4748
// (PyTorchModelIT, TextExpansionQueryIT, TextEmbeddingQueryIT).
@@ -68,6 +69,7 @@ const CSupportedOperations::TStringViewSet CSupportedOperations::ALLOWED_OPERATI
6869
"aten::clone"sv,
6970
"aten::contiguous"sv,
7071
"aten::copy_"sv,
72+
"aten::cos"sv,
7173
"aten::cumsum"sv,
7274
"aten::detach"sv,
7375
"aten::div"sv,
@@ -117,10 +119,13 @@ const CSupportedOperations::TStringViewSet CSupportedOperations::ALLOWED_OPERATI
117119
"aten::relu"sv,
118120
"aten::repeat"sv,
119121
"aten::reshape"sv,
122+
"aten::rsqrt"sv,
120123
"aten::rsub"sv,
121124
"aten::scaled_dot_product_attention"sv,
122125
"aten::select"sv,
123126
"aten::sign"sv,
127+
"aten::silu"sv,
128+
"aten::sin"sv,
124129
"aten::size"sv,
125130
"aten::slice"sv,
126131
"aten::softmax"sv,

bin/pytorch_inference/unittest/CModelGraphValidatorTest.cc

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -259,25 +259,25 @@ BOOST_AUTO_TEST_CASE(testValidModuleWithAllowedOps) {
259259
}
260260

261261
BOOST_AUTO_TEST_CASE(testModuleWithUnrecognisedOps) {
262-
// torch.sin is not in the transformer allowlist.
262+
// torch.logit is not in the transformer allowlist.
263263
::torch::jit::Module m("__torch__.UnknownOps");
264264
m.define(R"(
265265
def forward(self, x: Tensor) -> Tensor:
266-
return torch.sin(x)
266+
return torch.logit(x)
267267
)");
268268

269269
auto result = CModelGraphValidator::validate(m);
270270

271271
BOOST_REQUIRE(result.s_IsValid == false);
272272
BOOST_REQUIRE(result.s_ForbiddenOps.empty());
273273
BOOST_REQUIRE(result.s_UnrecognisedOps.empty() == false);
274-
bool foundSin = false;
274+
bool foundLogit = false;
275275
for (const auto& op : result.s_UnrecognisedOps) {
276-
if (op == "aten::sin") {
277-
foundSin = true;
276+
if (op == "aten::logit") {
277+
foundLogit = true;
278278
}
279279
}
280-
BOOST_REQUIRE(foundSin);
280+
BOOST_REQUIRE(foundLogit);
281281
}
282282

283283
BOOST_AUTO_TEST_CASE(testModuleNodeCountPopulated) {
@@ -301,7 +301,7 @@ BOOST_AUTO_TEST_CASE(testModuleWithSubmoduleInlines) {
301301
::torch::jit::Module child("__torch__.Child");
302302
child.define(R"(
303303
def forward(self, x: Tensor) -> Tensor:
304-
return torch.sin(x)
304+
return torch.logit(x)
305305
)");
306306

307307
::torch::jit::Module parent("__torch__.Parent");
@@ -314,19 +314,19 @@ BOOST_AUTO_TEST_CASE(testModuleWithSubmoduleInlines) {
314314
auto result = CModelGraphValidator::validate(parent);
315315

316316
BOOST_REQUIRE(result.s_IsValid == false);
317-
bool foundSin = false;
317+
bool foundLogit = false;
318318
for (const auto& op : result.s_UnrecognisedOps) {
319-
if (op == "aten::sin") {
320-
foundSin = true;
319+
if (op == "aten::logit") {
320+
foundLogit = true;
321321
}
322322
}
323-
BOOST_REQUIRE(foundSin);
323+
BOOST_REQUIRE(foundLogit);
324324
}
325325

326326
// --- Integration tests with malicious .pt model fixtures ---
327327
//
328328
// These load real TorchScript models that simulate attack vectors.
329-
// The .pt files are generated by testfiles/generate_malicious_models.py.
329+
// The .pt files are generated by dev-tools/generate_malicious_models.py.
330330

331331
namespace {
332332
bool hasForbiddenOp(const CModelGraphValidator::SResult& result, const std::string& op) {
@@ -363,34 +363,38 @@ BOOST_AUTO_TEST_CASE(testMaliciousMixedFileReader) {
363363
BOOST_AUTO_TEST_CASE(testMaliciousHiddenInSubmodule) {
364364
// Unrecognised ops buried three levels deep in nested submodules.
365365
// The validator must inline through all submodules to find them.
366+
// The leaf uses aten::logit (still unrecognised) so the fixture stays
367+
// invalid when aten::sin is allowed for EuroBERT/Jina v5.
366368
auto module = ::torch::jit::load("testfiles/malicious_models/malicious_hidden_in_submodule.pt");
367369
auto result = CModelGraphValidator::validate(module);
368370

369371
BOOST_REQUIRE(result.s_IsValid == false);
370372
BOOST_REQUIRE(result.s_ForbiddenOps.empty());
371-
BOOST_REQUIRE(hasUnrecognisedOp(result, "aten::sin"));
373+
BOOST_REQUIRE(result.s_UnrecognisedOps.empty() == false);
372374
}
373375

374376
BOOST_AUTO_TEST_CASE(testMaliciousConditionalBranch) {
375377
// An unrecognised op hidden inside a conditional branch. The
376378
// validator must recurse into prim::If blocks to detect it.
379+
// The model uses aten::sin which is now allowed, but also contains
380+
// other ops that remain unrecognised.
377381
auto module = ::torch::jit::load("testfiles/malicious_models/malicious_conditional.pt");
378382
auto result = CModelGraphValidator::validate(module);
379383

380384
BOOST_REQUIRE(result.s_IsValid == false);
381-
BOOST_REQUIRE(hasUnrecognisedOp(result, "aten::sin"));
385+
BOOST_REQUIRE(result.s_UnrecognisedOps.empty() == false);
382386
}
383387

384388
BOOST_AUTO_TEST_CASE(testMaliciousManyUnrecognisedOps) {
385-
// A model using many different unrecognised ops (sin, cos, tan, exp).
389+
// A model using many different ops (sin, cos, tan, exp).
390+
// sin and cos are now allowed (EuroBERT/Jina v5), but tan and exp
391+
// remain unrecognised.
386392
auto module = ::torch::jit::load("testfiles/malicious_models/malicious_many_unrecognised.pt");
387393
auto result = CModelGraphValidator::validate(module);
388394

389395
BOOST_REQUIRE(result.s_IsValid == false);
390396
BOOST_REQUIRE(result.s_ForbiddenOps.empty());
391-
BOOST_REQUIRE(result.s_UnrecognisedOps.size() >= 4);
392-
BOOST_REQUIRE(hasUnrecognisedOp(result, "aten::sin"));
393-
BOOST_REQUIRE(hasUnrecognisedOp(result, "aten::cos"));
397+
BOOST_REQUIRE(result.s_UnrecognisedOps.size() >= 2);
394398
BOOST_REQUIRE(hasUnrecognisedOp(result, "aten::tan"));
395399
BOOST_REQUIRE(hasUnrecognisedOp(result, "aten::exp"));
396400
}
Binary file not shown.

bin/pytorch_inference/unittest/testfiles/reference_model_ops.json

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1007,6 +1007,48 @@
10071007
"prim::NumToTensor"
10081008
]
10091009
},
1010+
"jina-embeddings-v5-text-nano": {
1011+
"model_id": "jinaai/jina-embeddings-v5-text-nano",
1012+
"quantized": false,
1013+
"ops": [
1014+
"aten::Int",
1015+
"aten::add",
1016+
"aten::arange",
1017+
"aten::cat",
1018+
"aten::contiguous",
1019+
"aten::cos",
1020+
"aten::detach",
1021+
"aten::dropout",
1022+
"aten::embedding",
1023+
"aten::expand",
1024+
"aten::floor_divide",
1025+
"aten::linear",
1026+
"aten::masked_fill",
1027+
"aten::matmul",
1028+
"aten::mean",
1029+
"aten::mul",
1030+
"aten::neg",
1031+
"aten::pow",
1032+
"aten::reshape",
1033+
"aten::rsqrt",
1034+
"aten::scaled_dot_product_attention",
1035+
"aten::silu",
1036+
"aten::sin",
1037+
"aten::size",
1038+
"aten::slice",
1039+
"aten::sub",
1040+
"aten::to",
1041+
"aten::transpose",
1042+
"aten::unsqueeze",
1043+
"aten::view",
1044+
"prim::Constant",
1045+
"prim::GetAttr",
1046+
"prim::ListConstruct",
1047+
"prim::NumToTensor",
1048+
"prim::TupleConstruct",
1049+
"prim::TupleUnpack"
1050+
]
1051+
},
10101052
"qa-tinyroberta-squad2": {
10111053
"model_id": "deepset/tinyroberta-squad2",
10121054
"quantized": false,

dev-tools/extract_model_ops/reference_models.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
"elastic-eis-elser-v2-quantized": {"model_id": "elastic/eis-elser-v2", "quantized": true},
3131
"elastic-test-elser-v2-quantized": {"model_id": "elastic/test-elser-v2", "quantized": true},
3232

33+
"jina-embeddings-v5-text-nano": "jinaai/jina-embeddings-v5-text-nano",
34+
3335
"_comment:qa-models": "Models from the Appex QA pytorch_tests suite. BART models require auto_class and config_overrides to trace correctly.",
3436
"qa-tinyroberta-squad2": {"model_id": "deepset/tinyroberta-squad2", "auto_class": "AutoModelForQuestionAnswering"},
3537
"qa-squeezebert-mnli": "typeform/squeezebert-mnli",

dev-tools/extract_model_ops/torchscript_utils.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,14 @@ def load_and_trace_hf_model(model_name: str, quantize: bool = False,
116116
overrides = config_overrides or {}
117117

118118
try:
119-
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
119+
tokenizer = AutoTokenizer.from_pretrained(
120+
model_name, token=token, trust_remote_code=True)
120121
config = AutoConfig.from_pretrained(
121-
model_name, torchscript=True, token=token, **overrides)
122+
model_name, torchscript=True, token=token,
123+
trust_remote_code=True, **overrides)
122124
model = model_cls.from_pretrained(
123-
model_name, config=config, token=token)
125+
model_name, config=config, token=token,
126+
trust_remote_code=True)
124127
model.eval()
125128
except Exception as exc:
126129
print(f" LOAD ERROR: {exc}", file=sys.stderr)

dev-tools/extract_model_ops/validation_models.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
"es-cross-encoder-ms-marco": "cross-encoder/ms-marco-MiniLM-L-6-v2",
3232
"es-dpr-question-encoder": "facebook/dpr-question_encoder-single-nq-base",
3333

34+
"jina-embeddings-v5-text-nano": "jinaai/jina-embeddings-v5-text-nano",
35+
3436
"_comment:qa-models": "Models from the Appex QA pytorch_tests suite. BART models require auto_class and config_overrides to trace correctly.",
3537
"qa-tinyroberta-squad2": {"model_id": "deepset/tinyroberta-squad2", "auto_class": "AutoModelForQuestionAnswering"},
3638
"qa-squeezebert-mnli": "typeform/squeezebert-mnli",

dev-tools/generate_malicious_models.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,11 @@ def forward(self, x: Tensor) -> Tensor:
4848

4949

5050
class HiddenInSubmodule(torch.nn.Module):
51-
"""Hides aten::sin (unrecognised) three levels deep in submodules."""
51+
"""Hides aten::logit (unrecognised) three levels deep in submodules.
52+
53+
Uses logit+clamp instead of sin so the fixture stays invalid when
54+
aten::sin is added to the allowlist for transformer models (e.g. EuroBERT).
55+
"""
5256
def __init__(self):
5357
super().__init__()
5458
self.inner = _Inner()
@@ -69,7 +73,7 @@ def forward(self, x: Tensor) -> Tensor:
6973

7074
class _Leaf(torch.nn.Module):
7175
def forward(self, x: Tensor) -> Tensor:
72-
return torch.sin(x)
76+
return torch.logit(torch.clamp(x, 1e-6, 1.0 - 1e-6))
7377

7478

7579
class ConditionalMalicious(torch.nn.Module):

0 commit comments

Comments
 (0)