Skip to content

Delta housekeeping notebooks #95

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ Operations are applied concurrently across multiple tables
* OPTIMIZE with z-order on tables having specified columns
* Detect tables having too many small files ([example notebook](examples/detect_small_files.py))
* Visualise quantity of data written per table per period
* Delta housekeeping analysis ([example notebook](examples/exec_delta_housekeeping.py)) which provides:
* stats (size of tables and number of files, timestamps of latest OPTIMIZE & VACUUM operations, stats of OPTIMIZE)
* recommendations on tables that need to be OPTIMIZED/VACUUM'ed
* are tables OPTIMIZED/VACUUM'ed often enough
* tables that have small files / tables for which ZORDER is not being effective
* **Governance**
* PII detection with Presidio ([example notebook](examples/pii_detection_presidio.py))
* Text Analysis with MosaicML and Databricks MLflow ([example notebook](examples/text_analysis_mosaicml_mlflow.py))
Expand Down
378 changes: 378 additions & 0 deletions discoverx/delta_housekeeping.py

Large diffs are not rendered by default.

14 changes: 12 additions & 2 deletions discoverx/explorer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import concurrent.futures
import copy
import re
from typing import Optional, List
import pandas as pd
from typing import Optional, List, Callable, Iterable
from discoverx import logging
from discoverx.common import helper
from discoverx.discovery import Discovery
Expand All @@ -11,6 +12,7 @@
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import lit
from discoverx.table_info import InfoFetcher, TableInfo
from discoverx.delta_housekeeping import DeltaHousekeeping, DeltaHousekeepingActions


logger = logging.Logging()
Expand Down Expand Up @@ -147,7 +149,7 @@ def scan(
discover.scan(rules=rules, sample_size=sample_size, what_if=what_if)
return discover

def map(self, f) -> list[any]:
def map(self, f: Callable) -> list[any]:
"""Runs a function for each table in the data explorer

Args:
Expand Down Expand Up @@ -178,6 +180,14 @@ def map(self, f) -> list[any]:

return res

def delta_housekeeping(self) -> "DeltaHousekeepingActions":
    """Run a Delta Housekeeping analysis over every table matched by this explorer.

    Each matched table is scanned via `map` (which runs `DeltaHousekeeping.scan`
    per table), producing one pandas DataFrame of stats per table.

    Returns:
        DeltaHousekeepingActions: an actions object over the collected stats
        (callers use e.g. `apply()`, `display()`, `explain()` on it).
    """
    dh = DeltaHousekeeping(self._spark)
    # One stats DataFrame per scanned table
    dfs_pd: Iterable[pd.DataFrame] = self.map(dh.scan)
    return DeltaHousekeepingActions(dfs_pd, spark=self._spark)


class DataExplorerActions:
def __init__(
Expand Down
47 changes: 47 additions & 0 deletions examples/exec_delta_housekeeping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # Run Delta Housekeeping across multiple tables
# MAGIC Analysis that provides stats on Delta tables / recommendations for improvements, including:
# MAGIC - stats: size of tables and number of files, timestamps of latest OPTIMIZE & VACUUM operations, stats of OPTIMIZE
# MAGIC - recommendations on tables that need to be OPTIMIZED/VACUUM'ed
# MAGIC   - are tables OPTIMIZED/VACUUM'ed often enough
# MAGIC   - tables that have small files / tables for which ZORDER is not being effective
# MAGIC

# COMMAND ----------

# MAGIC %pip install dbl-discoverx

# COMMAND ----------

from discoverx import DX

dx = DX()

# COMMAND ----------

# Table selection pattern (catalog.schema.table, wildcards allowed);
# set the widget to the scope you want to analyse.
dbutils.widgets.text("table_pattern", "*.*.*")

# COMMAND ----------

# DBTITLE 1,Run the discoverx DeltaHousekeeping operation - generates an output object you can apply operations to
output = (
    dx.from_tables(dbutils.widgets.get("table_pattern"))
    .delta_housekeeping()
)

# COMMAND ----------

# DBTITLE 1,apply() operation generates a spark dataframe with recommendations
result = output.apply()
result.select("catalog", "database", "tableName", "recommendation").display()

# COMMAND ----------

# DBTITLE 1,display() runs apply and displays the full result (including stats per table)
output.display()

# COMMAND ----------

# DBTITLE 1,explain() outputs the DeltaHousekeeping recommendations in HTML format
output.explain()

# COMMAND ----------


2 changes: 2 additions & 0 deletions tests/unit/data/delta_housekeeping/dd_click_sales.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
catalog,database,tableName,number_of_files,bytes
lorenzorubi,default,click_sales,6,326068799
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
catalog,database,tableName,number_of_files,bytes
lorenzorubi,default,housekeeping_summary,1,192917
4 changes: 4 additions & 0 deletions tests/unit/data/delta_housekeeping/dh_click_sales.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
catalog,database,tableName,operation,timestamp,min_file_size,p50_file_size,max_file_size,z_order_by
lorenzorubi,default,click_sales,VACUUM END,2023-12-06T16:40:28Z,null,null,null,null
lorenzorubi,default,click_sales,VACUUM END,2023-12-05T01:19:47Z,null,null,null,null
lorenzorubi,default,click_sales,VACUUM END,2023-11-25T04:03:41Z,null,null,null,null
25 changes: 25 additions & 0 deletions tests/unit/data/delta_housekeeping/dh_housekeeping_summary.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
catalog,database,tableName,operation,timestamp,min_file_size,p50_file_size,max_file_size,z_order_by
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T05:50:14Z,192917,192917,192917,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T05:21:22Z,184203,184203,184203,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T04:37:19Z,176955,176955,176955,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T04:10:26Z,168560,168560,168560,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T03:11:02Z,161710,161710,161710,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T02:44:41Z,154166,154166,154166,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T02:18:54Z,145990,145990,145990,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T01:42:12Z,137677,137677,137677,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T01:09:19Z,130864,130864,130864,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:53:33Z,123702,123702,123702,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:43:44Z,118806,118806,118806,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:28:00Z,111983,111983,111983,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:14:21Z,104790,104790,104790,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T23:47:02Z,97314,97314,97314,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T23:18:17Z,91509,91509,91509,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T22:14:48Z,84152,84152,84152,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T21:57:53Z,76464,76464,76464,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T21:30:49Z,67498,67498,67498,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T21:18:59Z,59412,59412,59412,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T20:30:48Z,51173,51173,51173,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T20:12:59Z,42346,42346,42346,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T19:35:05Z,34463,34463,34463,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T19:30:46Z,28604,28604,28604,[]
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T19:06:51Z,8412,17592,17592,[]
20 changes: 20 additions & 0 deletions tests/unit/data/delta_housekeeping/dhk_pandas_result.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error
lorenzorubi,default,housekeeping_summary_v3,1,3787,null,null,null,null,null,null,null,null,null
lorenzorubi,maxmind_geo,gold_ipv6,1,4907069,null,null,null,null,null,null,null,null,null
lorenzorubi,default,click_sales,6,326068799,null,null,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,null,null,null,null,null
lorenzorubi,default,housekeeping_summary,1,192917,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,null,null,192917,192917,192917,[],null
lorenzorubi,default,housekeeping_summary_v2,3,12326,2023-12-18T11:25:35Z,null,null,null,5273,5273,5273,[],null
lorenzorubi,maxmind_geo,raw_locations,1,6933,null,null,null,null,null,null,null,null,null
lorenzorubi,tpch,customer,1,61897021,null,null,null,null,null,null,null,null,null
lorenzorubi,tpch,nation,1,3007,null,null,null,null,null,null,null,null,null
lorenzorubi,maxmind_geo,raw_ipv6,1,1783720,null,null,null,null,null,null,null,null,null
lorenzorubi,maxmind_geo,gold_ipv4,1,7220024,null,null,null,null,null,null,null,null,null
lorenzorubi,dais_dlt_2023,enriched_orders,null,null,null,null,null,null,null,null,null,null,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`enriched_orders` does not support DESCRIBE DETAIL. ; line 2 pos 20
lorenzorubi,default,click_sales_history,1,7710,null,null,null,null,null,null,null,null,null
lorenzorubi,tpch,orders,2406,317120666,null,null,null,null,null,null,null,null,null
lorenzorubi,default,complete_data,6,326060019,null,null,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,null,null,null,null,null
lorenzorubi,maxmind_geo,raw_ipv4,1,3115269,null,null,null,null,null,null,null,null,null
lorenzorubi,gcp_cost_analysis,sku_prices,1,835,null,null,null,null,null,null,null,null,null
lorenzorubi,dais_dlt_2023,daily_totalorders_by_nation,null,null,null,null,null,null,null,null,null,null,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`daily_totalorders_by_nation` does not support DESCRIBE DETAIL. ; line 2 pos 20
lorenzorubi,gcp_cost_analysis,project_ids,2,1774,null,null,null,null,null,null,null,null,null
lorenzorubi,dais_dlt_2023,daily_2nd_high_orderprice,null,null,null,null,null,null,null,null,null,null,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`daily_2nd_high_orderprice` does not support DESCRIBE DETAIL. ; line 2 pos 20
4 changes: 4 additions & 0 deletions tests/unit/data/delta_housekeeping/expected_need_optimize.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error
lorenzorubi,default,click_sales,6.0,326068799.0,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,,,
lorenzorubi,tpch,orders,2406.0,317120666.0,,,,,,,,,
lorenzorubi,default,complete_data,6.0,326060019.0,,,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,,,,,
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by
lorenzorubi,default,click_sales,6,326068799,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,,
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by
lorenzorubi,default,housekeeping_summary,1,192917,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,,,192917,192917,192917,[]
49 changes: 49 additions & 0 deletions tests/unit/delta_housekeeping_actions_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest
import pandas as pd
from discoverx.delta_housekeeping import DeltaHousekeepingActions
from pathlib import Path


def _resolve_file_path(request, relative_path):
module_path = Path(request.module.__file__)
test_file_path = module_path.parent / relative_path
return pd.read_csv(str(test_file_path.resolve()))


@pytest.fixture()
def housekeeping_stats(request):
    """DeltaHousekeeping stats table loaded as a pandas DataFrame."""
    csv_path = "data/delta_housekeeping/dhk_pandas_result.csv"
    return _resolve_file_path(request, csv_path)


@pytest.fixture()
def expected_need_optimize(request):
    """Expected set of tables flagged as needing OPTIMIZE."""
    csv_path = "data/delta_housekeeping/expected_need_optimize.csv"
    return _resolve_file_path(request, csv_path)


def test_apply_output(housekeeping_stats, expected_need_optimize):
    """generate_recommendations() flags exactly the expected tables as needing OPTIMIZE."""
    dha = DeltaHousekeepingActions(
        None,
        stats=housekeeping_stats,
    )
    res = dha.generate_recommendations()
    # One recommendation bucket per housekeeping rule
    assert len(res) == 6
    need_optimize = [item for item in res if (list(item.keys())[0] == dha.tables_not_optimized_legend)]
    assert len(need_optimize) == 1
    need_optimize_df = list(need_optimize[0].values())[0]
    # Only compare table identity columns; stats columns are covered elsewhere
    pd.testing.assert_frame_equal(
        need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]],
        expected_need_optimize.loc[:, ["catalog", "database", "tableName"]],
    )


def test_empty_apply_output(housekeeping_stats):
    """With an extremely high size threshold, no table is flagged for OPTIMIZE."""
    one_tib = 1024 * 1024 * 1024 * 1024
    dha = DeltaHousekeepingActions(
        None,
        stats=housekeeping_stats,
        min_table_size_optimize=one_tib,
    )
    recommendations = dha.generate_recommendations()
    assert len(recommendations) == 5
    matches = [
        rec for rec in recommendations
        if list(rec.keys())[0] == dha.tables_not_optimized_legend
    ]
    assert not matches
98 changes: 98 additions & 0 deletions tests/unit/delta_housekeeping_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import pytest
import pandas as pd
from discoverx.delta_housekeeping import DeltaHousekeeping
from pathlib import Path
import pyspark.sql.functions as F


def _resolve_file_path(request, relative_path):
module_path = Path(request.module.__file__)
test_file_path = module_path.parent / relative_path
return pd.read_csv(
str(test_file_path.resolve()),
dtype={
"max_optimize_timestamp": "str",
"2nd_optimize_timestamp": "str",
"max_vacuum_timestamp": "str",
"2nd_vacuum_timestamp": "str",
}
)


@pytest.fixture()
def dh_click_sales(request):
    """DESCRIBE HISTORY fixture for the click_sales table (VACUUM-only history)."""
    csv_path = "data/delta_housekeeping/dh_click_sales.csv"
    return _resolve_file_path(request, csv_path)


@pytest.fixture()
def dd_click_sales(request):
    """DESCRIBE DETAIL fixture for the click_sales table."""
    csv_path = "data/delta_housekeeping/dd_click_sales.csv"
    return _resolve_file_path(request, csv_path)


@pytest.fixture()
def expected_pdh_click_sales(request):
    """Expected _process_describe_history output for click_sales."""
    csv_path = "data/delta_housekeeping/expected_pdh_click_sales.csv"
    return _resolve_file_path(request, csv_path)


@pytest.fixture()
def dh_housekeeping_summary(request):
    """DESCRIBE HISTORY fixture for housekeeping_summary (OPTIMIZE-only history)."""
    csv_path = "data/delta_housekeeping/dh_housekeeping_summary.csv"
    return _resolve_file_path(request, csv_path)


@pytest.fixture()
def dd_housekeeping_summary(request):
    """DESCRIBE DETAIL fixture for the housekeeping_summary table."""
    csv_path = "data/delta_housekeeping/dd_housekeeping_summary.csv"
    return _resolve_file_path(request, csv_path)


@pytest.fixture()
def expected_pdh_housekeeping_summary(request):
    """Expected _process_describe_history output for housekeeping_summary."""
    csv_path = "data/delta_housekeeping/expected_pdh_housekeeping_summary.csv"
    return _resolve_file_path(request, csv_path)


def test_process_describe_history_no_optimize(spark, dh_click_sales, dd_click_sales, expected_pdh_click_sales):
    """Output matches the expected CSV when the history contains no OPTIMIZE rows."""
    dh = DeltaHousekeeping(spark)
    detail = spark.createDataFrame(dd_click_sales)
    history = spark.createDataFrame(dh_click_sales)
    result = dh._process_describe_history(detail, history)
    pd.testing.assert_frame_equal(
        result.reset_index(),
        expected_pdh_click_sales.reset_index(),
    )


def test_process_describe_history_no_vacuum(
    spark, dh_housekeeping_summary, dd_housekeeping_summary, expected_pdh_housekeeping_summary
):
    """Output matches the expected CSV when the history contains no VACUUM rows."""
    dh = DeltaHousekeeping(spark)
    detail = spark.createDataFrame(dd_housekeeping_summary)
    history = spark.createDataFrame(dh_housekeeping_summary)
    result = dh._process_describe_history(detail, history)
    pd.testing.assert_frame_equal(
        result.reset_index(),
        expected_pdh_housekeeping_summary.reset_index(),
    )


def test_process_describe_history_no_operation(spark, dd_click_sales):
    """An empty DESCRIBE HISTORY leaves the DESCRIBE DETAIL output unchanged."""
    dh = DeltaHousekeeping(spark)
    detail = spark.createDataFrame(dd_click_sales)
    empty_history = spark.createDataFrame([], "string")
    result = dh._process_describe_history(detail, empty_history)
    # output should be equal to DESCRIBE DETAIL
    pd.testing.assert_frame_equal(
        result.reset_index(),
        dd_click_sales.reset_index(),
    )


def test_process_describe_history_empty_history(spark, dd_click_sales, dh_click_sales):
    """History rows whose operation is neither OPTIMIZE nor VACUUM are ignored."""
    dh = DeltaHousekeeping(spark)
    describe_detail_df = spark.createDataFrame(dd_click_sales)
    # Force every history row to an operation the housekeeping logic does not track
    describe_history_df = spark.createDataFrame(dh_click_sales).withColumn("operation", F.lit("NOOP"))
    out = dh._process_describe_history(describe_detail_df, describe_history_df)
    # output should be equal to DESCRIBE DETAIL
    pd.testing.assert_frame_equal(
        out.reset_index(),
        dd_click_sales.reset_index(),
    )
12 changes: 12 additions & 0 deletions tests/unit/explorer_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pandas
import pytest
from discoverx.explorer import DataExplorer, DataExplorerActions, InfoFetcher, TableInfo

Expand Down Expand Up @@ -89,3 +90,14 @@ def test_no_tables_matching_filter(spark, info_fetcher):
data_explorer = DataExplorer("some_catalog.default.non_existent_table", spark, info_fetcher)
with pytest.raises(ValueError):
data_explorer.map(lambda table_info: table_info)


def test_delta_housekeeping_call(spark, info_fetcher):
    """delta_housekeeping() scans the three default-schema tables and its stats
    keep only the identity/error columns."""
    data_explorer = DataExplorer("*.default.*", spark, info_fetcher)
    result: pandas.DataFrame = data_explorer.delta_housekeeping()._stats
    assert result["tableName"].count() == 3
    for table_name in result["tableName"].tolist():
        assert table_name in ["tb_all_types", "tb_1", "tb_2"]
    for col in result.columns:
        assert col in ["catalog", "database", "tableName", "error"]