Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ jobs:
- name: Tests
env:
SUPER_SECRET: ${{ secrets.HF_HUB_TOKEN }}
PYTHONIOENCODING: "utf-8"
run: |
python -m pytest -s -v --cov-report=xml -m "not inference" skops/

Expand Down
2 changes: 2 additions & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ v0.7
- `compression` and `compresslevel` from :class:`~zipfile.ZipFile` are now
exposed to the user via :func:`.io.dumps` and :func:`.io.dump`. :pr:`345` by
`Adrin Jalali`_.
- Fix: :func:`skops.io.visualize` is now capable of showing bytes. :pr:`352` by
`Benjamin Bossan`_.

v0.6
----
Expand Down
12 changes: 12 additions & 0 deletions skops/io/_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import operator
import uuid
from functools import partial
from reprlib import Repr
from types import FunctionType, MethodType
from typing import Any, Sequence

Expand All @@ -27,6 +28,9 @@
)
from .exceptions import UnsupportedTypeException

arepr = Repr()
arepr.maxstring = 24


def dict_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:
res = {
Expand Down Expand Up @@ -527,6 +531,11 @@ def _construct(self):
content = self.children["content"].getvalue()
return content

def format(self):
content = self.children["content"].getvalue()
byte_repr = arepr.repr(content)
return byte_repr


class BytearrayNode(BytesNode):
def __init__(
Expand All @@ -543,6 +552,9 @@ def _construct(self):
content_bytearray = bytearray(list(content_bytes))
return content_bytearray

def format(self):
return f"bytearray({super().format()})"


def operator_func_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:
_, attrs = obj.__reduce__()
Expand Down
14 changes: 12 additions & 2 deletions skops/io/_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,21 @@
from zipfile import ZipFile

from ._audit import VALID_NODE_CHILD_TYPES, Node, get_tree
from ._general import FunctionNode, JsonNode, ListNode
from ._general import BytearrayNode, BytesNode, FunctionNode, JsonNode, ListNode
from ._numpy import NdArrayNode
from ._scipy import SparseMatrixNode
from ._utils import LoadContext

# The children of these types are not visualized
SKIPPED_TYPES = (
BytearrayNode,
BytesNode,
FunctionNode,
JsonNode,
NdArrayNode,
SparseMatrixNode,
)


@dataclass
class NodeInfo:
Expand Down Expand Up @@ -269,7 +279,7 @@ def walk_tree(
# TODO: For better security, we should check the schema if we return early,
# otherwise something nefarious could be hidden inside (however, if there
# is, the node should be marked as unsafe)
if isinstance(node, (NdArrayNode, SparseMatrixNode, FunctionNode, JsonNode)):
if isinstance(node, SKIPPED_TYPES):
return

yield from walk_tree(
Expand Down
70 changes: 58 additions & 12 deletions skops/io/tests/test_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@

Packages that are not builtins, standard lib, numpy, scipy, or scikit-learn.

Testing:

- persistence of unfitted models
- persistence of fitted models
- visualization of dumped models

with a range of hyperparameters.

"""

import pytest
from sklearn.datasets import make_classification, make_regression

from skops.io import dumps, loads
from skops.io import dumps, loads, visualize
from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal

# Default settings for generated data
Expand Down Expand Up @@ -46,6 +54,11 @@ def rank_data(clf_data):
return X, y, group


def _null(*args, **kwargs):
# used to prevent printing anything to stdout when calling visualize
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

passing this as sink is disabling a fair amount of code, not just printing. Are you sure you want this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is true. The relevant part of the code is executed, though, i.e. the tests would fail without the bugfix. If you want me to still change it, I can think of something (I guess auto fixture that overrides sys.stdout, not sure if it conflicts with pytest).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, but you're also specifically testing this bug anyway. So it'd be nice to have visualize to actually test the whole thing, in case later we add something which would break things.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, makes sense, done.

return


class TestLightGBM:
"""Tests for LGBMClassifier, LGBMRegressor, LGBMRanker"""

Expand Down Expand Up @@ -83,9 +96,12 @@ def test_classifier(self, lgbm, clf_data, trusted, boosting_type):

X, y = clf_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)

@pytest.mark.parametrize("boosting_type", boosting_types)
def test_regressor(self, lgbm, regr_data, trusted, boosting_type):
kw = {}
Expand All @@ -99,9 +115,12 @@ def test_regressor(self, lgbm, regr_data, trusted, boosting_type):

X, y = regr_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)

@pytest.mark.parametrize("boosting_type", boosting_types)
def test_ranker(self, lgbm, rank_data, trusted, boosting_type):
kw = {}
Expand All @@ -115,9 +134,12 @@ def test_ranker(self, lgbm, rank_data, trusted, boosting_type):

X, y, group = rank_data
estimator.fit(X, y, group=group)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)


class TestXGBoost:
"""Tests for XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor, XGBRanker
Expand Down Expand Up @@ -170,9 +192,12 @@ def test_classifier(self, xgboost, clf_data, trusted, booster, tree_method):

X, y = clf_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)

@pytest.mark.parametrize("booster", boosters)
@pytest.mark.parametrize("tree_method", tree_methods)
def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method):
Expand All @@ -186,9 +211,12 @@ def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method):

X, y = regr_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)

@pytest.mark.parametrize("booster", boosters)
@pytest.mark.parametrize("tree_method", tree_methods)
def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method):
Expand All @@ -202,9 +230,12 @@ def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method):

X, y = clf_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)

@pytest.mark.parametrize("booster", boosters)
@pytest.mark.parametrize("tree_method", tree_methods)
def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method):
Expand All @@ -218,9 +249,12 @@ def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method):

X, y = regr_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)

@pytest.mark.parametrize("booster", boosters)
@pytest.mark.parametrize("tree_method", tree_methods)
def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method):
Expand All @@ -234,9 +268,12 @@ def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method):

X, y, group = rank_data
estimator.fit(X, y, group=group)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)


class TestCatboost:
"""Tests for CatBoostClassifier, CatBoostRegressor, and CatBoostRanker"""
Expand Down Expand Up @@ -290,9 +327,12 @@ def test_classifier(self, catboost, cb_clf_data, trusted, boosting_type):

X, y = cb_clf_data
estimator.fit(X, y, cat_features=[0, 1])
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)

@pytest.mark.parametrize("boosting_type", boosting_types)
def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type):
estimator = catboost.CatBoostRegressor(
Expand All @@ -303,9 +343,12 @@ def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type):

X, y = cb_regr_data
estimator.fit(X, y, cat_features=[0, 1])
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)

@pytest.mark.parametrize("boosting_type", boosting_types)
def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type):
estimator = catboost.CatBoostRanker(
Expand All @@ -316,5 +359,8 @@ def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type):

X, y, group_id = cb_rank_data
estimator.fit(X, y, cat_features=[0, 1], group_id=group_id)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted, sink=_null)
20 changes: 20 additions & 0 deletions skops/io/tests/test_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,3 +269,23 @@ def test_from_file(self, simple, tmp_path, capsys):
]
stdout, _ = capsys.readouterr()
assert stdout.strip() == "\n".join(expected)

def test_long_bytes(self, capsys):
obj = {
"short_byte": b"abc",
"long_byte": b"010203040506070809101112131415",
"short_bytearray": bytearray(b"abc"),
"long_bytearray": bytearray(b"010203040506070809101112131415"),
}
dumped = sio.dumps(obj)
sio.visualize(dumped)

expected = [
"root: builtins.dict",
"├── short_byte: b'abc'",
"├── long_byte: b'01020304050...9101112131415'",
"├── short_bytearray: bytearray(b'abc')",
"└── long_bytearray: bytearray(b'01020304050...9101112131415')",
]
stdout, _ = capsys.readouterr()
assert stdout.strip() == "\n".join(expected)