-
Notifications
You must be signed in to change notification settings - Fork 17
Delta housekeeping notebooks #95
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5d7889e
90bab27
94629e0
543f852
567b303
bded305
cf4ef07
bc303cd
feeafaf
e8a1b66
e177ef4
023b02f
c2b028f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Databricks notebook source | ||
# MAGIC %md | ||
# MAGIC # Run Delta Housekeeping across multiple tables | ||
# MAGIC Analysis that provides stats on Delta tables / recommendations for improvements, including: | ||
# MAGIC - stats (size of tables and number of files, timestamps of latest OPTIMIZE & VACUUM operations, stats of OPTIMIZE) | ||
# MAGIC - recommendations on tables that need to be OPTIMIZED/VACUUM'ed | ||
# MAGIC - whether tables are OPTIMIZED/VACUUM'ed often enough | ||
# MAGIC - tables that have small files / tables for which ZORDER is not being effective | ||
# MAGIC | ||
edurdevic marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %pip install dbl-discoverx | ||
|
||
# COMMAND ---------- | ||
|
||
from discoverx import DX | ||
|
||
dx = DX() | ||
|
||
# COMMAND ---------- | ||
|
||
# DBTITLE 1,Run the discoverx DeltaHousekeeping operation - generates an output object you can apply operations to | ||
output = ( | ||
dx.from_tables("lorenzorubi.*.*") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Uh, I missed this. |
||
.delta_housekeeping() | ||
) | ||
|
||
# COMMAND ---------- | ||
|
||
# DBTITLE 1,apply() operation generates a spark dataframe with recommendations | ||
result = output.apply() | ||
result.select("catalog", "database", "tableName", "recommendation").display() | ||
|
||
# COMMAND ---------- | ||
|
||
# DBTITLE 1,display() runs apply and displays the full result (including stats per table) | ||
output.display() | ||
|
||
# COMMAND ---------- | ||
|
||
# DBTITLE 1,explain() outputs the DeltaHousekeeping recommendations in HTML format | ||
output.explain() | ||
|
||
# COMMAND ---------- | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
catalog,database,tableName,number_of_files,bytes | ||
lorenzorubi,default,click_sales,6,326068799 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
catalog,database,tableName,number_of_files,bytes | ||
lorenzorubi,default,housekeeping_summary,1,192917 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
catalog,database,tableName,operation,timestamp,min_file_size,p50_file_size,max_file_size,z_order_by | ||
lorenzorubi,default,click_sales,VACUUM END,2023-12-06T16:40:28Z,null,null,null,null | ||
lorenzorubi,default,click_sales,VACUUM END,2023-12-05T01:19:47Z,null,null,null,null | ||
lorenzorubi,default,click_sales,VACUUM END,2023-11-25T04:03:41Z,null,null,null,null |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
catalog,database,tableName,operation,timestamp,min_file_size,p50_file_size,max_file_size,z_order_by | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T05:50:14Z,192917,192917,192917,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T05:21:22Z,184203,184203,184203,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T04:37:19Z,176955,176955,176955,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T04:10:26Z,168560,168560,168560,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T03:11:02Z,161710,161710,161710,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T02:44:41Z,154166,154166,154166,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T02:18:54Z,145990,145990,145990,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T01:42:12Z,137677,137677,137677,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T01:09:19Z,130864,130864,130864,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:53:33Z,123702,123702,123702,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:43:44Z,118806,118806,118806,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:28:00Z,111983,111983,111983,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:14:21Z,104790,104790,104790,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T23:47:02Z,97314,97314,97314,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T23:18:17Z,91509,91509,91509,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T22:14:48Z,84152,84152,84152,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T21:57:53Z,76464,76464,76464,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T21:30:49Z,67498,67498,67498,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T21:18:59Z,59412,59412,59412,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T20:30:48Z,51173,51173,51173,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T20:12:59Z,42346,42346,42346,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T19:35:05Z,34463,34463,34463,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T19:30:46Z,28604,28604,28604,[] | ||
lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T19:06:51Z,8412,17592,17592,[] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error | ||
lorenzorubi,default,housekeeping_summary_v3,1,3787,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,maxmind_geo,gold_ipv6,1,4907069,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,default,click_sales,6,326068799,null,null,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,null,null,null,null,null | ||
lorenzorubi,default,housekeeping_summary,1,192917,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,null,null,192917,192917,192917,[],null | ||
lorenzorubi,default,housekeeping_summary_v2,3,12326,2023-12-18T11:25:35Z,null,null,null,5273,5273,5273,[],null | ||
lorenzorubi,maxmind_geo,raw_locations,1,6933,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,tpch,customer,1,61897021,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,tpch,nation,1,3007,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,maxmind_geo,raw_ipv6,1,1783720,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,maxmind_geo,gold_ipv4,1,7220024,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,dais_dlt_2023,enriched_orders,null,null,null,null,null,null,null,null,null,null,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`enriched_orders` does not support DESCRIBE DETAIL. ; line 2 pos 20 | ||
lorenzorubi,default,click_sales_history,1,7710,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,tpch,orders,2406,317120666,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,default,complete_data,6,326060019,null,null,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,null,null,null,null,null | ||
lorenzorubi,maxmind_geo,raw_ipv4,1,3115269,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,gcp_cost_analysis,sku_prices,1,835,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,dais_dlt_2023,daily_totalorders_by_nation,null,null,null,null,null,null,null,null,null,null,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`daily_totalorders_by_nation` does not support DESCRIBE DETAIL. ; line 2 pos 20 | ||
lorenzorubi,gcp_cost_analysis,project_ids,2,1774,null,null,null,null,null,null,null,null,null | ||
lorenzorubi,dais_dlt_2023,daily_2nd_high_orderprice,null,null,null,null,null,null,null,null,null,null,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`daily_2nd_high_orderprice` does not support DESCRIBE DETAIL. ; line 2 pos 20 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error | ||
lorenzorubi,default,click_sales,6.0,326068799.0,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,,, | ||
lorenzorubi,tpch,orders,2406.0,317120666.0,,,,,,,,, | ||
lorenzorubi,default,complete_data,6.0,326060019.0,,,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,,,,, |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by | ||
lorenzorubi,default,click_sales,6,326068799,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,, |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by | ||
lorenzorubi,default,housekeeping_summary,1,192917,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,,,192917,192917,192917,[] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import pytest | ||
import pandas as pd | ||
from discoverx.delta_housekeeping import DeltaHousekeepingActions | ||
from pathlib import Path | ||
|
||
|
||
def _resolve_file_path(request, relative_path):
    """Load a CSV test-data file located relative to the requesting test module.

    Despite its name, this helper returns the parsed file contents, not a path.

    Args:
        request: pytest ``request`` fixture; only ``request.module.__file__``
            is read, to find the directory of the test module.
        relative_path: Location of the CSV file relative to that directory.

    Returns:
        pandas.DataFrame: The file parsed with ``pd.read_csv`` defaults.
    """
    tests_dir = Path(request.module.__file__).parent
    csv_path = (tests_dir / relative_path).resolve()
    return pd.read_csv(str(csv_path))
|
||
|
||
@pytest.fixture()
def housekeeping_stats(request):
    """Return the sample housekeeping stats CSV as a pandas DataFrame."""
    csv_relative_path = "data/delta_housekeeping/dhk_pandas_result.csv"
    return _resolve_file_path(request, csv_relative_path)
|
||
|
||
@pytest.fixture()
def expected_need_optimize(request):
    """Return the expected "needs OPTIMIZE" tables CSV as a pandas DataFrame."""
    csv_relative_path = "data/delta_housekeeping/expected_need_optimize.csv"
    return _resolve_file_path(request, csv_relative_path)
|
||
|
||
def test_apply_output(housekeeping_stats, expected_need_optimize):
    """Check the recommendations generated from the sample housekeeping stats.

    Six recommendation entries are expected. Exactly one of them must be keyed
    by the "tables not optimized" legend, and the tables it lists must match
    the expected CSV on the identifying columns.
    """
    key_columns = ["catalog", "database", "tableName"]
    actions = DeltaHousekeepingActions(None, stats=housekeeping_stats)

    recommendations = actions.generate_recommendations()
    assert len(recommendations) == 6

    # Each entry is treated as a mapping whose first key is its legend.
    matching = []
    for entry in recommendations:
        if list(entry.keys())[0] == actions.tables_not_optimized_legend:
            matching.append(entry)
    assert len(matching) == 1

    # The first value of the matching entry is the DataFrame of flagged tables.
    flagged_tables = list(matching[0].values())[0]
    actual = flagged_tables.reset_index().loc[:, key_columns]
    expected = expected_need_optimize.loc[:, key_columns]
    pd.testing.assert_frame_equal(actual, expected)
# TODO complete all the tests | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please remove the TODO |
||
|
||
|
||
def test_empty_apply_output(housekeeping_stats):
    """Check that a very high size threshold yields no "needs OPTIMIZE" entry.

    With ``min_table_size_optimize`` raised to 1024**4 (presumably bytes, i.e.
    1 TiB -- confirm against ``DeltaHousekeepingActions``), five recommendation
    entries are expected and none is keyed by the "tables not optimized" legend.
    """
    huge_threshold = 1024 ** 4
    actions = DeltaHousekeepingActions(
        None,
        stats=housekeeping_stats,
        min_table_size_optimize=huge_threshold,
    )

    recommendations = actions.generate_recommendations()
    assert len(recommendations) == 5

    matching = []
    for entry in recommendations:
        if list(entry.keys())[0] == actions.tables_not_optimized_legend:
            matching.append(entry)
    assert len(matching) == 0
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
import pytest | ||
import pandas as pd | ||
from discoverx.delta_housekeeping import DeltaHousekeeping | ||
from pathlib import Path | ||
import pyspark.sql.functions as F | ||
|
||
|
||
def _resolve_file_path(request, relative_path):
    """Load a CSV test-data file located relative to the requesting test module.

    Despite its name, this helper returns the parsed file contents, not a path.
    The four timestamp columns are read as strings so that pandas does not
    infer another dtype for them. Columns missing from a file are ignored.

    Args:
        request: pytest ``request`` fixture; only ``request.module.__file__``
            is read, to find the directory of the test module.
        relative_path: Location of the CSV file relative to that directory.

    Returns:
        pandas.DataFrame: The parsed CSV contents.
    """
    timestamp_columns = (
        "max_optimize_timestamp",
        "2nd_optimize_timestamp",
        "max_vacuum_timestamp",
        "2nd_vacuum_timestamp",
    )
    tests_dir = Path(request.module.__file__).parent
    csv_path = (tests_dir / relative_path).resolve()
    return pd.read_csv(
        str(csv_path),
        dtype={column: "str" for column in timestamp_columns},
    )
|
||
|
||
@pytest.fixture()
def dh_click_sales(request):
    """Return the ``dh_click_sales.csv`` test data as a pandas DataFrame."""
    csv_relative_path = "data/delta_housekeeping/dh_click_sales.csv"
    return _resolve_file_path(request, csv_relative_path)
|
||
|
||
@pytest.fixture()
def dd_click_sales(request):
    """Return the ``dd_click_sales.csv`` test data as a pandas DataFrame."""
    csv_relative_path = "data/delta_housekeeping/dd_click_sales.csv"
    return _resolve_file_path(request, csv_relative_path)
|
||
|
||
@pytest.fixture()
def expected_pdh_click_sales(request):
    """Return the ``expected_pdh_click_sales.csv`` test data as a pandas DataFrame."""
    csv_relative_path = "data/delta_housekeeping/expected_pdh_click_sales.csv"
    return _resolve_file_path(request, csv_relative_path)
|
||
|
||
@pytest.fixture()
def dh_housekeeping_summary(request):
    """Return the ``dh_housekeeping_summary.csv`` test data as a pandas DataFrame."""
    csv_relative_path = "data/delta_housekeeping/dh_housekeeping_summary.csv"
    return _resolve_file_path(request, csv_relative_path)
|
||
|
||
@pytest.fixture()
def dd_housekeeping_summary(request):
    """Return the ``dd_housekeeping_summary.csv`` test data as a pandas DataFrame."""
    csv_relative_path = "data/delta_housekeeping/dd_housekeeping_summary.csv"
    return _resolve_file_path(request, csv_relative_path)
|
||
|
||
@pytest.fixture()
def expected_pdh_housekeeping_summary(request):
    """Return the ``expected_pdh_housekeeping_summary.csv`` test data as a pandas DataFrame."""
    csv_relative_path = "data/delta_housekeeping/expected_pdh_housekeeping_summary.csv"
    return _resolve_file_path(request, csv_relative_path)
|
||
|
||
def test_process_describe_history_no_optimize(spark, dh_click_sales, dd_click_sales, expected_pdh_click_sales):
    """Process the click_sales detail/history data and compare with the expected CSV.

    Per the test name, this covers a table whose history has no OPTIMIZE
    operation.
    """
    housekeeping = DeltaHousekeeping(spark)
    # Arguments are (DESCRIBE DETAIL frame, DESCRIBE HISTORY frame), in that order.
    actual = housekeeping._process_describe_history(
        spark.createDataFrame(dd_click_sales),
        spark.createDataFrame(dh_click_sales),
    )
    expected = expected_pdh_click_sales.reset_index()
    pd.testing.assert_frame_equal(actual.reset_index(), expected)
|
||
|
||
def test_process_describe_history_no_vacuum(
    spark, dh_housekeeping_summary, dd_housekeeping_summary, expected_pdh_housekeeping_summary
):
    """Process the housekeeping_summary detail/history data and compare with the expected CSV.

    Per the test name, this covers a table whose history has no VACUUM
    operation.
    """
    housekeeping = DeltaHousekeeping(spark)
    # Arguments are (DESCRIBE DETAIL frame, DESCRIBE HISTORY frame), in that order.
    actual = housekeeping._process_describe_history(
        spark.createDataFrame(dd_housekeeping_summary),
        spark.createDataFrame(dh_housekeeping_summary),
    )
    expected = expected_pdh_housekeeping_summary.reset_index()
    pd.testing.assert_frame_equal(actual.reset_index(), expected)
|
||
|
||
def test_process_describe_history_no_operation(spark, dd_click_sales):
    """Check that an empty history frame leaves the DESCRIBE DETAIL data unchanged.

    The history input is an empty Spark DataFrame created with the schema
    string ``"string"``; the processed output must equal the detail input.
    """
    housekeeping = DeltaHousekeeping(spark)
    detail_sdf = spark.createDataFrame(dd_click_sales)
    empty_history_sdf = spark.createDataFrame([], "string")

    actual = housekeeping._process_describe_history(detail_sdf, empty_history_sdf)

    # With no history rows, the output should equal DESCRIBE DETAIL.
    expected = dd_click_sales.reset_index()
    pd.testing.assert_frame_equal(actual.reset_index(), expected)
|
||
|
||
def test_process_describe_history_empty_history(spark, dd_click_sales, dh_click_sales):
    """Check that a history of only "NOOP" operations leaves DESCRIBE DETAIL unchanged.

    Every ``operation`` value in the click_sales history is overwritten with
    the literal ``"NOOP"`` before processing; the output must equal the
    detail input.
    """
    # NOTE(review): a reviewer suggested defining the input DataFrames inline
    # inside the tests to make them more readable.
    housekeeping = DeltaHousekeeping(spark)
    detail_sdf = spark.createDataFrame(dd_click_sales)
    noop_history_sdf = spark.createDataFrame(dh_click_sales).withColumn("operation", F.lit("NOOP"))

    actual = housekeeping._process_describe_history(detail_sdf, noop_history_sdf)

    # With no recognised operations, the output should equal DESCRIBE DETAIL.
    expected = dd_click_sales.reset_index()
    pd.testing.assert_frame_equal(actual.reset_index(), expected)
Uh oh!
There was an error while loading. Please reload this page.