Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions examples/plot_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
pickle.dump(model, file=f)

local_repo = mkdtemp(prefix="skops")
hf_hub.init(model=pkl_name, requirements=["scikit-learn"], destination=local_repo)
hf_hub.init(model=pkl_name, requirements=["scikit-learn"], dst=local_repo)

# %%
# We can now see what the contents of the created local repo are:
Expand All @@ -86,10 +86,25 @@
# ===========
# And finally, we can push the model to the hub. This requires a user access
# token which you can get under https://huggingface.co/settings/tokens

# you can put your own token here, or set it as an environment variable before
# running this script.
token = os.environ["HF_HUB_TOKEN"]

repo_name = f"hf_hub_example-{uuid4()}"
# you can put your own token here.
MY_TOKEN = os.environ["HF_HUB_TOKEN"]
hf_hub.push(repo_id=repo_name, source=local_repo, token=MY_TOKEN)
user_name = HfApi().whoami(token=token)["name"]
repo_id = f"{user_name}/{repo_name}"

# Now we can push our files to the repo. The following function creates the
# remote repository if it doesn't exist; this is controlled via the
# ``create_remote`` argument.
hf_hub.push(
repo_id=repo_id,
source=local_repo,
token=token,
commit_message="pushing files to the repo from the example!",
create_remote=True,
)

# %%
# Now you can check the contents of the repository under your user.
Expand All @@ -110,4 +125,4 @@
# ``HfApi().delete_repo``. For more information please refer to the
# documentation of ``huggingface_hub`` library.

HfApi().delete_repo(repo_id=repo_name, token=MY_TOKEN)
HfApi().delete_repo(repo_id=repo_id, token=token)
37 changes: 21 additions & 16 deletions skops/_min_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,24 @@

# 'build' and 'install' are included to have structured metadata for CI.
# It will NOT be included in setup's extras_require
# The values are (version_spec, comma seperated tags)
# The values are (version_spec, comma separated tags, condition)
# tags can be: 'build', 'install', 'docs', 'examples', 'tests', 'benchmark'
# example:
# "tomli": ("1.1.0", "install", "python_full_version < '3.11.0a7'"),
dependent_packages = {
"scikit-learn": ("0.24", "install"),
"huggingface_hub": ("0.5", "install"),
"pytest": (PYTEST_MIN_VERSION, "tests"),
"pytest-cov": ("2.9.0", "tests"),
"flake8": ("3.8.2", "tests"),
"mypy": ("0.770", "tests"),
"sphinx": ("3.2.0", "docs"),
"sphinx-gallery": ("0.7.0", "docs"),
"sphinx-rtd-theme": ("1", "docs"),
"numpydoc": ("1.0.0", "docs"),
"sphinx-prompt": ("1.3.0", "docs"),
"matplotlib": ("3.3", "docs"),
"pandas": ("1", "docs"),
"scikit-learn": ("0.24", "install", None),
"huggingface_hub": ("0.5", "install", None),
"pytest": (PYTEST_MIN_VERSION, "tests", None),
"pytest-cov": ("2.9.0", "tests", None),
"flake8": ("3.8.2", "tests", None),
"mypy": ("0.770", "tests", None),
"sphinx": ("3.2.0", "docs", None),
"sphinx-gallery": ("0.7.0", "docs", None),
"sphinx-rtd-theme": ("1", "docs", None),
"numpydoc": ("1.0.0", "docs", None),
"sphinx-prompt": ("1.3.0", "docs", None),
"matplotlib": ("3.3", "docs", None),
"pandas": ("1", "docs", None),
}


Expand All @@ -29,9 +31,12 @@
extra: []
for extra in ["build", "install", "docs", "examples", "tests", "benchmark"]
}
for package, (min_version, extras) in dependent_packages.items():
for package, (min_version, extras, condition) in dependent_packages.items():
for extra in extras.split(", "):
tag_to_packages[extra].append("{}>={}".format(package, min_version))
spec = f"{package}>={min_version}"
if condition:
spec += f"; {condition}"
tag_to_packages[extra].append(spec)


# Used by CI to get the min dependencies
Expand Down
121 changes: 112 additions & 9 deletions skops/hf_hub/_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,26 @@
hub.
"""

import collections
import json
import shutil
from pathlib import Path
from typing import List, Union

from huggingface_hub import HfApi
from requests import HTTPError


def _validate_folder(path: Union[str, Path]):
"""Validate the contents of a folder.

This function checks if the contents of a folder make a valid scikit-learn
based repo on the HuggingFace Hub.

A valid repository is one which is understood by the Hub as well as this
library to run and use the model. Otherwise anything can be put as a model
repository on the Hub and be used as a `git` and `git lfs` server.

Raises a ``TypeError`` if invalid.

Parameters
Expand All @@ -24,12 +34,62 @@ def _validate_folder(path: Union[str, Path]):
-------
None
"""
pass
path = Path(path)
if not path.is_dir():
raise TypeError("The given path is not a directory.")

config_path = path / "config.json"
if not config_path.exists():
raise TypeError("Configuration file `config.json` missing.")

def init(
*, model: Union[str, Path], requirements: List[str], destination: Union[str, Path]
):
with open(config_path, "r") as f:
config = json.load(f)

model_path = config.get("sklearn", {}).get("model", {}).get("file", None)
if not model_path:
raise TypeError(
"Model file not configured in the configuration file. It should be stored"
" in the hf_hub.sklearn.model key."
)

if not (path / model_path).exists():
raise TypeError(f"Model file {model_path} does not exist.")


def _create_config(
    *, model_path: str, requirements: List[str], dst: Union[str, Path]
) -> None:
    """Write the configuration into a `config.json` file.

    The file is created (or overwritten) directly inside ``dst`` and has the
    following layout::

        {
            "sklearn": {
                "environment": [...],
                "model": {"file": "..."}
            }
        }

    Parameters
    ----------
    model_path : str
        The relative path (from the repo root) to the model file. Stored
        under the ``sklearn.model.file`` key.

    requirements : list of str
        A list of required packages. Stored as given under the
        ``sklearn.environment`` key.

    dst : str, or Path
        The path to an existing folder where the config file should be created.

    Returns
    -------
    None
    """
    # The structure is small and fixed, so a plain nested literal is enough;
    # it serializes exactly like an auto-vivifying defaultdict would.
    config = {
        "sklearn": {
            "environment": requirements,
            "model": {"file": model_path},
        }
    }

    # Explicit encoding so the written file does not depend on the platform's
    # default locale encoding.
    with open(Path(dst) / "config.json", mode="w", encoding="utf-8") as f:
        json.dump(config, f, sort_keys=True, indent=4)


def init(*, model: Union[str, Path], requirements: List[str], dst: Union[str, Path]):
"""Initialize a scikit-learn based HuggingFace repo.

Given a model pickle and a set of required packages, this function
Expand All @@ -44,14 +104,22 @@ def init(
A list of required packages. The versions are then extracted from the
current environment.

destination: str, or Path
The path to a non-existing folder which is to be initializes.
dst: str, or Path
The path to a non-existing or empty folder which is to be initialized.

Returns
-------
None
"""
pass
dst = Path(dst)
if dst.exists() and next(dst.iterdir(), None):
raise OSError("None-empty dst path already exists!")
dst.mkdir(parents=True, exist_ok=True)

shutil.copy2(src=model, dst=dst)

model_name = Path(model).name
_create_config(model_path=model_name, requirements=requirements, dst=dst)


def update_env(*, path: Union[str, Path], requirements: List[str] = None):
Expand All @@ -76,7 +144,14 @@ def update_env(*, path: Union[str, Path], requirements: List[str] = None):
pass


def push(*, repo_id: str, source: Union[str, Path], token: str = None):
def push(
*,
repo_id: str,
source: Union[str, Path],
token: str = None,
commit_message: str = None,
create_remote: bool = False,
):
"""Pushes the contents of a model repo to HuggingFace Hub.

This function validates the contents of the folder before pushing it to the
Expand All @@ -94,6 +169,15 @@ def push(*, repo_id: str, source: Union[str, Path], token: str = None):
A token to push to the hub. If not provided, the user should be already
logged in using ``huggingface-cli login``.

commit_message: str, optional
The commit message to be used when pushing to the repo.

create_remote: bool, optional
Whether to create the remote repository if it doesn't exist. If the
remote repository doesn't exist and this parameter is ``False``, it
raises an error. Otherwise it checks if the remote repository exists,
and would create it if it doesn't.
Comment on lines +175 to +179
Copy link
Collaborator

@merveenoyan merveenoyan Jun 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this exists, why do we create_repo in examples and tests? Wouldn't it cause confusion?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've removed it from the example, but the test needs to test different scenarios, and the user might create the repo before calling this method, so the test makes sure that case is tested as well.


Returns
-------
None
Expand All @@ -103,4 +187,23 @@ def push(*, repo_id: str, source: Union[str, Path], token: str = None):
This function raises a ``TypeError`` if the contents of the source folder
do not make a valid HuggingFace Hub scikit-learn based repo.
"""
pass
_validate_folder(path=source)
client = HfApi()

if create_remote:
try:
client.model_info(repo_id=repo_id, token=token)
except HTTPError:
client.create_repo(repo_id=repo_id, token=token, repo_type="model")

client.upload_folder(
repo_id=repo_id,
path_in_repo=".",
folder_path=source,
commit_message=commit_message,
commit_description=None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed? 😅

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now I'm not exposing this to the end user, but passing it here. Passing it explicitly means if huggingface_hub decides to change the default value of this parameter, our users wouldn't notice anything cause we're already explicitly setting it anyway.

token=token,
repo_type=None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is by default None and None refers to model BTW

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, same as above, I'm just explicitly setting it in case in the future the default value changes.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah gotcha!

revision=None,
create_pr=False,
)
10 changes: 10 additions & 0 deletions skops/hf_hub/tests/sample_repo/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"sklearn": {
"environment": [
"scikit-learn=\"1.1.1\""
],
"model": {
"file": "model.pkl"
}
}
}
Empty file.
Loading