1 change: 1 addition & 0 deletions .github/workflows/continuous_integration.yml
@@ -157,4 +157,5 @@ jobs:
       - name: Test with pytest
         if: ${{ github.event_name == 'merge_group' }}
         run: |
+          mkdir ~/Data
           uv run python -m pytest -vv -n 2 --dist loadgroup --cov=ethicml --cov-fail-under=80 tests/
35 changes: 17 additions & 18 deletions ethicml/data/tabular_data/acs.py
@@ -8,7 +8,7 @@
 import contextlib
 import os
 from pathlib import Path
-from typing import Literal, TypeAlias, get_args
+from typing import Literal, TypeAlias, cast, get_args
 from typing_extensions import override

 import numpy as np
@@ -98,12 +98,13 @@ def _download_dir(root: Path) -> Generator[None, None, None]:
 class _AcsBase(Dataset):
     split: str
     target: str
+    year: Literal["2014", "2015", "2016", "2017", "2018"]

     def __init__(
         self,
         name: str,
         root: str | Path,
-        year: str,
+        year: Literal["2014", "2015", "2016", "2017", "2018"],
         horizon: int,
         states: list[StateList],
         class_label_spec: str,
@@ -119,7 +120,7 @@ def __init__(

         self.year = year
         self.horizon = horizon
-        self.survey = "person"
+        self.survey: Literal["person", "household"] = "person"
         self.states = states
         self._invert_s = invert_s

@@ -294,7 +295,7 @@ class AcsIncome(_AcsBase):
     def __init__(
         self,
         root: str | Path,
-        year: str,
+        year: Literal["2014", "2015", "2016", "2017", "2018"],
         horizon: int,
         states: list[StateList],
         split: str = "Sex",
@@ -341,9 +342,8 @@ def load(
     ) -> DataTuple:
         from folktables import ACSDataSource, adult_filter, folktables

-        datasource = ACSDataSource(
-            survey_year=self.year, horizon=f"{self.horizon}-Year", survey=self.survey
-        )
+        horizon = cast(Literal["1-Year", "5-Year"], f"{self.horizon}-Year")
+        datasource = ACSDataSource(survey_year=self.year, horizon=horizon, survey=self.survey)

         with _download_dir(self.root):
             dataframe = datasource.get_data(states=self.states, download=True)
@@ -374,8 +374,8 @@ def load(
             postprocess=lambda x: np.nan_to_num(x, nan=-1),
         )

-        dataframe = data_obj._preprocess(dataframe)
-        dataframe[data_obj.target] = dataframe[data_obj.target].apply(data_obj._target_transform)
+        dataframe = data_obj._preprocess(dataframe)  # type: ignore[attr-defined]
+        dataframe[data_obj.target] = dataframe[data_obj.target].apply(data_obj._target_transform)  # type: ignore[attr-defined]

         for feat in disc_feats:
             dataframe[feat] = (
@@ -388,7 +388,7 @@ def load(

         dataframe = pd.get_dummies(dataframe[disc_feats + continuous_features])

-        dataframe = dataframe.apply(data_obj._postprocess)
+        dataframe = dataframe.apply(data_obj._postprocess)  # type: ignore[attr-defined]

         cow_cols = [col for col in dataframe.columns if col.startswith("COW")]
         mar_cols = [col for col in dataframe.columns if col.startswith("MAR")]
@@ -451,7 +451,7 @@ class AcsEmployment(_AcsBase):
     def __init__(
         self,
         root: str | Path,
-        year: str,
+        year: Literal["2014", "2015", "2016", "2017", "2018"],
         horizon: int,
         states: list[StateList],
         split: str = "Sex",
@@ -506,9 +506,8 @@ def load(
     ) -> DataTuple:
         from folktables import ACSDataSource, folktables

-        datasource = ACSDataSource(
-            survey_year=self.year, horizon=f"{self.horizon}-Year", survey=self.survey
-        )
+        horizon = cast(Literal["1-Year", "5-Year"], f"{self.horizon}-Year")
+        datasource = ACSDataSource(survey_year=self.year, horizon=horizon, survey=self.survey)

         with _download_dir(self.root):
             dataframe = datasource.get_data(states=self.states, download=True)
@@ -545,9 +544,9 @@ def load(
             postprocess=lambda x: np.nan_to_num(x, nan=-1),
         )

-        dataframe = data_obj._preprocess(dataframe)
-        dataframe[data_obj.target] = dataframe[data_obj.target].apply(data_obj._target_transform)
-        dataframe = dataframe.apply(data_obj._postprocess)
+        dataframe = data_obj._preprocess(dataframe)  # type: ignore[attr-defined]
+        dataframe[data_obj.target] = dataframe[data_obj.target].apply(data_obj._target_transform)  # type: ignore[attr-defined]
+        dataframe = dataframe.apply(data_obj._postprocess)  # type: ignore[attr-defined]

         for feat in disc_feats:
             dataframe[feat] = (
@@ -560,7 +559,7 @@ def load(

         dataframe = pd.get_dummies(dataframe[disc_feats + continuous_features])

-        dataframe = dataframe.apply(data_obj._postprocess)
+        dataframe = dataframe.apply(data_obj._postprocess)  # type: ignore[attr-defined]

         schl_cols = [col for col in dataframe.columns if col.startswith("SCHL")]
         mar_cols = [col for col in dataframe.columns if col.startswith("MAR")]
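Note on the `cast` pattern introduced above: an f-string is always typed as plain `str`, so `f"{self.horizon}-Year"` cannot be passed directly to a parameter annotated with `Literal["1-Year", "5-Year"]`. A minimal, self-contained sketch of the idea (toy names, not EthicML code):

```python
from typing import Literal, cast

Horizon = Literal["1-Year", "5-Year"]

def fetch(horizon: Horizon) -> None:
    """Stand-in for an API that only accepts the two literal strings."""
    print(f"fetching {horizon} data")

def make_horizon(years: int) -> Horizon:
    """Build a Horizon from an int, narrowing the str to the Literal type."""
    if years not in (1, 5):
        raise ValueError("horizon must be 1 or 5 years")
    # The f-string types as `str`; after the runtime check above, `cast`
    # tells the checker it is one of the two allowed literals.
    return cast(Horizon, f"{years}-Year")

fetch(make_horizon(1))  # prints: fetching 1-Year data
```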
2 changes: 1 addition & 1 deletion ethicml/implementations/dro_tabular.py
@@ -50,7 +50,7 @@ def train_model(
         num_epochs = len(train_loader.dataset)  # type: ignore[arg-type]
         print(
             f"train Epoch: {epoch} [{batch_idx * len(data_x)}/{num_epochs}"
-            f"\t({100. * batch_idx / len(train_loader):.0f}%)]"
+            f"\t({100.0 * batch_idx / len(train_loader):.0f}%)]"
             f"\tLoss: {loss.item() / len(data_x):.6f}"
         )

4 changes: 2 additions & 2 deletions ethicml/implementations/vfae.py
@@ -114,7 +114,7 @@ def train_model(
         if flags["supervised"]:
             print(
                 f"train Epoch: {epoch} [{batch_idx * len(data_x)}/{num_epochs}"
-                f"({100. * batch_idx / len(train_loader):.0f}%)]\t"
+                f"({100.0 * batch_idx / len(train_loader):.0f}%)]\t"
                 f"Loss: {loss.item() / len(data_x):.6f}\t"
                 f"pred_loss: {prediction_loss.item():.6f}\t"
                 f"recon_loss: {reconstruction_loss.item():.6f}\t"
@@ -124,7 +124,7 @@
         else:
             print(
                 f"train Epoch: {epoch} [{batch_idx * len(data_x)}/{num_epochs}"
-                f"({100. * batch_idx / len(train_loader):.0f}%)]\t"
+                f"({100.0 * batch_idx / len(train_loader):.0f}%)]\t"
                 f"Loss: {loss.item() / len(data_x):.6f}\t"
                 f"recon_loss: {reconstruction_loss.item():.6f}\t"
                 f"mmd_loss: {flags['batch_size'] * mmd_loss.item():.6f}"
6 changes: 3 additions & 3 deletions ethicml/implementations/vfae_modules/encoder.py
@@ -19,11 +19,11 @@ def __init__(self, enc_size: list[int], init_size: int, ld: int) -> None:
         self.encoder.add_module("batch norm 0", nn.BatchNorm1d(enc_size[0]))
         for k in range(len(enc_size) - 1):
             self.encoder.add_module(
-                f"encoder layer {k+1}", nn.Linear(enc_size[k], enc_size[k + 1])
+                f"encoder layer {k + 1}", nn.Linear(enc_size[k], enc_size[k + 1])
             )
-            self.encoder.add_module(f"encoder activation {k+1}", activation)
+            self.encoder.add_module(f"encoder activation {k + 1}", activation)
             self.encoder.add_module(
-                f"encoder batch norm {k+1}", nn.BatchNorm1d(enc_size[k + 1])
+                f"encoder batch norm {k + 1}", nn.BatchNorm1d(enc_size[k + 1])
             )
         self.z1_enc_mu = nn.Linear(enc_size[-1], ld)
         self.z1_enc_logvar = nn.Linear(enc_size[-1], ld)
12 changes: 6 additions & 6 deletions ethicml/implementations/zemel.py
@@ -10,7 +10,7 @@
 import pandas as pd
 import scipy.optimize as optim
 from scipy.spatial.distance import cdist
-from scipy.special import softmax  # type: ignore[attr-defined]
+from scipy.special import softmax

 from ethicml.implementations.utils import load_data_from_flags, save_transformations
 from ethicml.utility import DataTuple, SubgroupTuple, TestTuple
@@ -39,7 +39,7 @@ def LFR_optim_objective(  # noqa: N802, PLR0913
     A_z: float,  # noqa: N803
     print_interval: int,
     verbose: bool,  # noqa: FBT001 # disabled because scipy needs positional args
-) -> np.number:
+) -> np.floating:
     """LFR optim objective."""
     _, features_dim = x_unprivileged.shape

@@ -135,8 +135,8 @@ def fit(train: DataTuple, flags: "ZemelArgs", seed: int) -> Model:
     bnd = [(0, 1)] * flags["clusters"] + [(None, None)] * features_dim * flags["clusters"]
     LFR_optim_objective.steps = 0  # type: ignore[attr-defined]

-    learned_model = optim.fmin_l_bfgs_b(  # type: ignore[attr-defined]
-        LFR_optim_objective,
+    learned_model = optim.fmin_l_bfgs_b(
+        func=LFR_optim_objective,
         x0=parameters_initialization,
         epsilon=1e-5,
         args=(
@@ -151,11 +151,11 @@ def fit(train: DataTuple, flags: "ZemelArgs", seed: int) -> Model:
             print_interval,
             verbose,
         ),
-        bounds=bnd,
+        bounds=bnd,  # type: ignore[arg-type]
         approx_grad=True,
         maxfun=flags["maxfun"],
         maxiter=flags["max_iter"],
-        disp=verbose,
+        disp=verbose,  # type: ignore[arg-type]
     )[0]
     w = learned_model[: flags["clusters"]]
     prototypes = learned_model[flags["clusters"] :].reshape((flags["clusters"], features_dim))
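For context on the `fmin_l_bfgs_b` changes: with scipy calls now type-checked (see the stub changes in `pyproject.toml` below), the objective is passed as `func=` and the two remaining mismatches (`bounds` and the bool `disp`) get targeted `type: ignore[arg-type]` comments. A minimal sketch of the same scipy API on a toy objective, not the LFR objective:

```python
import numpy as np
import scipy.optimize as optim

def objective(params: np.ndarray) -> np.floating:
    """Toy objective: squared distance from the point (1, 2)."""
    return np.sum((params - np.array([1.0, 2.0])) ** 2)

x_opt, f_opt, info = optim.fmin_l_bfgs_b(
    func=objective,
    x0=np.zeros(2),
    approx_grad=True,  # no analytic gradient; use finite differences
    bounds=[(0, 1), (None, None)],  # same kind of bounds list as in the diff
    epsilon=1e-5,
    maxfun=1000,
    maxiter=100,
)
print(x_opt)  # approximately [1.0, 2.0]
```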
2 changes: 1 addition & 1 deletion ethicml/models/postprocess/dp_flip.py
@@ -72,7 +72,7 @@ def _flip(
     idx_s_y = _y.index.intersection(_s.index)
     rng = np.random.RandomState(seed)
     idxs = list(rng.permutation(idx_s_y))
-    update = pd.Series({idx: post_y_val for idx in idxs[:num_to_flip]}, dtype=preds.hard.dtype)
+    update = pd.Series(dict.fromkeys(idxs[:num_to_flip], post_y_val), dtype=preds.hard.dtype)
     preds.hard.update(update)
     return preds

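The `dict.fromkeys` rewrite above is a behaviour-preserving cleanup (the pattern lint rules such as flake8-comprehensions' C420 flag): a comprehension that maps every key to the same constant is better written as `dict.fromkeys`, which is safe here because the shared value is an immutable scalar. A small illustration:

```python
import pandas as pd

idxs = [7, 3, 9, 1]
post_y_val = 1
num_to_flip = 2

# Two equivalent ways to build {7: 1, 3: 1}.
via_comprehension = {idx: post_y_val for idx in idxs[:num_to_flip]}
via_fromkeys = dict.fromkeys(idxs[:num_to_flip], post_y_val)
assert via_comprehension == via_fromkeys

# As in the diff: wrap it in a Series to update predictions by index.
update = pd.Series(via_fromkeys, dtype="int64")
print(update)  # index [7, 3], both values 1
```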
2 changes: 1 addition & 1 deletion ethicml/models/postprocess/hardt.py
@@ -6,7 +6,7 @@
 import numpy as np
 from numpy.random import RandomState
 import pandas as pd
-from scipy.optimize import OptimizeResult, linprog  # type: ignore[attr-defined]
+from scipy.optimize import OptimizeResult, linprog

 from ethicml.metrics.per_sensitive_attribute import metric_per_sens
 from ethicml.metrics.tnr import TNR
2 changes: 1 addition & 1 deletion ethicml/utility/data_structures.py
@@ -480,7 +480,7 @@ def write_as_npz(
         for entry, values in data.items()
     }

-    np.savez(data_path, **as_numpy, **column_names, **extra)
+    np.savez(data_path, allow_pickle=False, **as_numpy, **column_names, **extra)


 def concat(datatup_list: Sequence[T], *, ignore_index: bool = False) -> T:
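On the `np.savez` change: `allow_pickle=False` makes the save fail loudly if any array would need pickling (for example `object` dtype), rather than silently writing a pickle-bearing archive that `np.load` would later refuse by default. Note that `savez` only accepts this keyword on sufficiently recent NumPy versions. A round-trip sketch under that assumption:

```python
import numpy as np

xs = np.array([[1.0, 2.0], [3.0, 4.0]])
cols = np.array(["a1", "a2"])  # fixed-width unicode dtype, no pickling needed

# Rejects object arrays at save time, so the archive always loads without pickle.
np.savez("/tmp/example.npz", allow_pickle=False, x=xs, columns=cols)

with np.load("/tmp/example.npz") as data:  # allow_pickle=False is load's default
    print(data["x"].shape, list(data["columns"]))  # (2, 2) ['a1', 'a2']
```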
11 changes: 5 additions & 6 deletions pyproject.toml
@@ -52,6 +52,7 @@ data = [
 plot = [
     "matplotlib>=3.8",
     "seaborn>=0.9.0",
+    "imageio",
 ]
 metrics = [
     "scikit-learn>=0.20.1",
@@ -69,9 +70,10 @@ test = [
     "pip<23.0.0,>=22.3.1",
 ]
 typecheck = [
+    "microsoft-python-type-stubs @ git+https://github.com/microsoft/python-type-stubs.git@76ca370",
     "mypy>=0.990",
     "pandas-stubs>=1.4.2.220626",
-    "python-type-stubs @ git+https://github.com/wearepal/python-type-stubs.git@8d5f608",
+    "scipy-stubs>=1.15.3.0",
     "types-seaborn<1.0.0.0,>=0.13.2.20240205",
 ]
 lint = [
@@ -157,10 +159,7 @@ module = [
     "aif360.*",
     "cloudpickle",
     "fairlearn.*",
-    "folktables.*",
-    "imageio",
-    "scipy.*",
     "sklearn.*",
 ]
 ignore_missing_imports = true

@@ -186,8 +185,8 @@ reportUnknownVariableType = "none"
 reportUnknownParameterType = "none"
 reportUnknownArgumentType = "none"
 reportUnknownLambdaType = "none"
-venvPath = "/home/tmk/.cache/pypoetry/virtualenvs"
-venv = "ethicml-dzQunYke-py3.10"
+venvPath = "."
+venv = ".venv"

 [tool.ruff]
 line-length = 100
6 changes: 3 additions & 3 deletions tests/models_test/inprocess_test/models_inprocessing_test.py
@@ -58,7 +58,7 @@ class InprocessTest(NamedTuple):


 INPROCESS_TESTS = [
-    InprocessTest(name="Adversarial Debiasing", model=AdvDebiasing(dir=TMPDIR), num_pos=45),
+    InprocessTest(name="Adversarial Debiasing", model=AdvDebiasing(dir=TMPDIR), num_pos=40),
     InprocessTest(name="Agarwal, lr, dp, 0.1", model=Agarwal(dir=TMPDIR), num_pos=45),
     InprocessTest(
         name="Agarwal, gbt, dp, 0.1",
@@ -103,9 +103,9 @@ class InprocessTest(NamedTuple):
         name="HGR linear_model", model=HGR(dir=TMPDIR, model_type=ModelType.linear), num_pos=60
     ),
     InprocessTest(
-        name="HGR deep_model", model=HGR(dir=TMPDIR, model_type=ModelType.deep), num_pos=69
+        name="HGR deep_model", model=HGR(dir=TMPDIR, model_type=ModelType.deep), num_pos=68
     ),
-    InprocessTest(name="Fair Dummies deep_model", model=FairDummies(dir=TMPDIR), num_pos=59),
+    InprocessTest(name="Fair Dummies deep_model", model=FairDummies(dir=TMPDIR), num_pos=58),
     InprocessTest(name="Kamiran & Calders lr C=1.0", model=Reweighting(), num_pos=44),
     InprocessTest(name="Logistic Regression (C=1.0)", model=LR(), num_pos=44),
     InprocessTest(name="LRCV", model=LRCV(), num_pos=40),
3 changes: 1 addition & 2 deletions tests/run_algorithm_test.py
@@ -1,7 +1,6 @@
 """Test that an algorithm can run against some data."""

 from dataclasses import dataclass
-import os
 from pathlib import Path
 from typing import Literal
 from typing_extensions import Self
@@ -186,7 +185,7 @@ def test_run_alg_suite() -> None:
         topic="pytest",
     )

-    files = os.listdir(Path() / "results")
+    files = [p.name for p in (Path() / "results").iterdir()]
     file_names = [
         "pytest_Adult Race-Binary_Upsample uniform.csv",
         "pytest_Adult Race-Binary_no_transform.csv",
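The `os.listdir` to `Path.iterdir` swap keeps the test on `pathlib` throughout and lets the `import os` go. The two forms yield the same set of names:

```python
import os
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    results = Path(tmp) / "results"
    results.mkdir()
    (results / "example.csv").touch()
    # Old style: plain strings from os.listdir.
    # New style: Path objects from iterdir, reduced to their names.
    assert sorted(os.listdir(results)) == sorted(p.name for p in results.iterdir())
```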
14 changes: 7 additions & 7 deletions tests/saving_data_test.py
@@ -90,8 +90,8 @@ def test_dataset_name_none() -> None:
     """Tests that a DataTuple can be saved without the name property."""
     datatup = DataTuple.from_df(
         x=pd.DataFrame([3.0], columns=["a1"]),
-        s=pd.Series([4.0], name="b2"),
-        y=pd.Series([6.0], name="c3"),
+        s=pd.Series([4], name="b2"),
+        y=pd.Series([6], name="c3"),
         name=None,
     )
     with TemporaryDirectory() as tmpdir:
@@ -110,7 +110,7 @@ def test_dataset_name_with_spaces() -> None:
     """Tests that a dataset name can contain spaces and special chars."""
     name = "This is a very@#$%^&*((())) complicated name"
     datatup = SubgroupTuple.from_df(
-        x=pd.DataFrame([3.0], columns=["a1"]), s=pd.Series([4.0], name="b2"), name=name
+        x=pd.DataFrame([3.0], columns=["a1"]), s=pd.Series([4], name="b2"), name=name
     )
     with TemporaryDirectory() as tmpdir:
         tmp_path = Path(tmpdir)
@@ -127,8 +127,8 @@ def test_apply_to_joined_df() -> None:
     """Tests apply_to_joined_df_function."""
     datatup = DataTuple.from_df(
         x=pd.DataFrame([3.0], columns=["a1"]),
-        s=pd.Series([4.0], name="b2"),
-        y=pd.Series([6.0], name="c3"),
+        s=pd.Series([4], name="b2"),
+        y=pd.Series([6], name="c3"),
         name=None,
     )

@@ -152,8 +152,8 @@ def test_data_tuple_len() -> None:

     datatup_equal_len = DataTuple.from_df(
         x=pd.DataFrame([3.0, 2.0, 1.0], columns=["a1"]),
-        s=pd.Series([4.0, 5.0, 9.0], name="b2"),
-        y=pd.Series([6.0, 4.2, 6.7], name="c3"),
+        s=pd.Series([4, 5, 9], name="b2"),
+        y=pd.Series([6, 4, 6], name="c3"),
         name=None,
     )
     assert len(datatup_equal_len) == 3
14 changes: 14 additions & 0 deletions typings/folktables/__init__.pyi
@@ -0,0 +1,14 @@
+from .acs import ACSDataSource as ACSDataSource
+from .acs import ACSEmployment as ACSEmployment
+from .acs import ACSEmploymentFiltered as ACSEmploymentFiltered
+from .acs import ACSHealthInsurance as ACSHealthInsurance
+from .acs import ACSIncome as ACSIncome
+from .acs import ACSIncomePovertyRatio as ACSIncomePovertyRatio
+from .acs import ACSMobility as ACSMobility
+from .acs import ACSPublicCoverage as ACSPublicCoverage
+from .acs import ACSTravelTime as ACSTravelTime
+from .acs import adult_filter as adult_filter
+from .folktables import BasicProblem as BasicProblem
+from .folktables import DataSource as DataSource
+from .folktables import Problem as Problem
+from .load_acs import generate_categories as generate_categories
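This stub package is what allows the `"folktables.*"` entry to come out of the mypy ignore-list in `pyproject.toml` above: folktables itself ships no type information, so the `typings/` directory supplies it (pyright picks up `typings/` by default; mypy presumably finds it via `mypy_path`). For illustration only, the referenced `typings/folktables/acs.pyi` would declare `adult_filter` roughly like this; the signature is hypothetical, inferred from how `load()` uses it:

```python
# typings/folktables/acs.pyi (sketch; the real stub file may differ)
import pandas as pd

def adult_filter(data: pd.DataFrame) -> pd.DataFrame:
    """Mimic the row filters of the original UCI Adult dataset."""
    ...
```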