From f7acd92fe9cc623cd50504844f65bfa6f3eb32e7 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Fri, 12 Apr 2024 10:54:03 +0200 Subject: [PATCH 01/22] Started new effectivtiy Satellite macro --- macros/tables/eff_sat_v0.sql | 14 ++ macros/tables/snowflake/eff_sat_v0.sql | 208 +++++++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 macros/tables/eff_sat_v0.sql create mode 100644 macros/tables/snowflake/eff_sat_v0.sql diff --git a/macros/tables/eff_sat_v0.sql b/macros/tables/eff_sat_v0.sql new file mode 100644 index 00000000..af9f10ed --- /dev/null +++ b/macros/tables/eff_sat_v0.sql @@ -0,0 +1,14 @@ +{%- macro eff_sat_v0(source_models, tracked_hashkey, src_ldts=none, src_rsrc=none, deleted_flag_alias=none) -%} + + {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and deleted_flag_alias are not set. #} + {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} + {%- set src_rsrc = datavault4dbt.replace_standard(src_rsrc, 'datavault4dbt.rsrc_alias', 'rsrc') -%} + {%- set deleted_flag_alias = datavault4dbt.replace_standard(deleted_flag_alias, 'datavault4dbt.deleted_flag_alias', 'deleted_flag') -%} + + {{ return(adapter.dispatch('eff_sat_v0', 'datavault4dbt')(tracked_hashkey=tracked_hashkey, + src_ldts=src_ldts, + src_rsrc=src_rsrc, + deleted_flag_alias=deleted_flag_alias, + source_models=source_models) ) + }} +{%- endmacro -%} \ No newline at end of file diff --git a/macros/tables/snowflake/eff_sat_v0.sql b/macros/tables/snowflake/eff_sat_v0.sql new file mode 100644 index 00000000..edfec997 --- /dev/null +++ b/macros/tables/snowflake/eff_sat_v0.sql @@ -0,0 +1,208 @@ +{%- macro snowflake__eff_sat_v0(source_models, existing_eff_model, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} + +{%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set timestamp_format = datavault4dbt.timestamp_format() -%} + +{%- set ns = namespace(new_hashkeys_cte="", 
disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} + +{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} +{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} +{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} + {%- set source_models = {source_models: {}} -%} +{%- endif -%} + +{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} +{%- set source_models = source_model_values['source_model_list'] -%} +{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} +{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} +{{ log('source_models: '~source_models, false) }} + +{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} + +{{ log('columns to select: '~final_columns_to_select, false) }} + +{{ datavault4dbt.prepend_generated_by() }} + +WITH + +{%- if is_incremental() and execute %} + + current_status_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as RN + FROM {{ this }} + + ), + + current_status AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }} + FROM current_status_prep + WHERE RN = 1 + + ), + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = source_model['tracked_hashkey'] -%} + + new_hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ 
tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} + AND cs.{{ deleted_flag_alias }} = 0 + WHERE cs.{{ tracked_hashkey }} IS NULL + + {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} + + ), + + {%- endfor -%} + + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + cs.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 1 as {{ deleted_flag_alias }} + FROM current_status cs + WHERE + {% for source_model in source_models %} + {%- set tracked_hashkey_src = source_model['tracked_hashkey'] -%} + {{ 'AND' if not loop.first }} + NOT EXISTS ( + SELECT + 1 + FROM {{ ref(source_model.name) }} src + WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} + ) + {% endfor %} + AND cs.{{ deleted_flag_alias }} = 0 + + ), + + + {%- if source_models | length > 1 -%} + + new_hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union' -%} + + ), + + {%- endif -%} + + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.new_hashkeys_cte }} + + UNION + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM disappeared_hashkeys + + ) + +{%- else %} + + {% for source_model in source_models %} + + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = source_model['tracked_hashkey'] -%} + + hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ 
tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + + {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} + + ), + + {%- endfor -%} + + {%- if source_models | length > 1 -%} + + hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + {%- set ns.last_cte = 'hashkeys_union' -%} + + ), + + {%- endif -%} + + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.last_cte }} + + ) + + +{% endif %} + +SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} +FROM records_to_insert + +{%- endmacro -%} + + + From a62a837e9a924517ca2e29ddcc2fb87ecb2ff413 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Fri, 23 Aug 2024 16:33:06 +0200 Subject: [PATCH 02/22] Added comments --- macros/tables/snowflake/eff_sat_v0.sql | 63 ++++++++++++++++++-------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/macros/tables/snowflake/eff_sat_v0.sql b/macros/tables/snowflake/eff_sat_v0.sql index edfec997..7c5e3a2a 100644 --- a/macros/tables/snowflake/eff_sat_v0.sql +++ b/macros/tables/snowflake/eff_sat_v0.sql @@ -1,4 +1,4 @@ -{%- macro snowflake__eff_sat_v0(source_models, existing_eff_model, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} +{%- macro snowflake__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} {%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} {%- set timestamp_format = datavault4dbt.timestamp_format() -%} @@ -19,40 +19,46 @@ {%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} +{%- set 
final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} +{%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} +{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} +{%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} + {{ log('columns to select: '~final_columns_to_select, false) }} {{ datavault4dbt.prepend_generated_by() }} WITH - +{# + For incremental runs, three different cases can occur: + hashkey disappeared -> deleted_flag = 1 + hashkey reappeared -> deleted_flag = 0 + new hashkey appeared -> deleted_flag = 0 +#} {%- if is_incremental() and execute %} - current_status_prep AS ( - - SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as RN - FROM {{ this }} - - ), - + {# + First, the current status for each hashkey is queried + #} current_status AS ( - SELECT + SELECT {{ tracked_hashkey }}, {{ deleted_flag_alias }}, {{ src_rsrc }} - FROM current_status_prep - WHERE RN = 1 + FROM {{ this }} + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 ), {%- for source_model in source_models -%} + {# + For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. + #} {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = source_model['tracked_hashkey'] -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} new_hashkeys_{{ source_number }} AS ( @@ -73,6 +79,10 @@ WITH {%- endfor -%} + {# + All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. + If they can't be found anywhere, they are marked as deleted_flag = 1. 
+ #} disappeared_hashkeys AS ( SELECT DISTINCT @@ -83,7 +93,7 @@ WITH FROM current_status cs WHERE {% for source_model in source_models %} - {%- set tracked_hashkey_src = source_model['tracked_hashkey'] -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} {{ 'AND' if not loop.first }} NOT EXISTS ( SELECT @@ -99,6 +109,9 @@ WITH {%- if source_models | length > 1 -%} + {# + If more than one source_model is defined, the new hashkeys of all source_models are unioned. + #} new_hashkeys_union AS ( {%- for source_model in source_models -%} @@ -124,6 +137,10 @@ WITH {%- endif -%} + {# + All hashkeys that have a status change should be inserted. + That includes new, reappeared, and disappeared hashkeys. + #} records_to_insert AS ( SELECT @@ -140,10 +157,16 @@ WITH {%- else %} + {# + In initial runs, every available hashkey is marked as deleted_flag = 0. + #} {% for source_model in source_models %} + {# + When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. 
+ #} {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = source_model['tracked_hashkey'] -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} hashkeys_{{ source_number }} AS ( @@ -176,7 +199,7 @@ WITH FROM hashkeys_{{ source_number }} {%- if not loop.last %} - UNION + UNION {% endif -%} {%- endfor -%} From b0bcd49ecc06a32c36c248038ef3a96b0f48e135 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Mon, 9 Sep 2024 17:32:21 +0200 Subject: [PATCH 03/22] adjust column spacing --- macros/tables/snowflake/eff_sat_v0.sql | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/macros/tables/snowflake/eff_sat_v0.sql b/macros/tables/snowflake/eff_sat_v0.sql index 7c5e3a2a..2cf174cc 100644 --- a/macros/tables/snowflake/eff_sat_v0.sql +++ b/macros/tables/snowflake/eff_sat_v0.sql @@ -36,7 +36,7 @@ WITH hashkey reappeared -> deleted_flag = 0 new hashkey appeared -> deleted_flag = 0 #} -{%- if is_incremental() and execute %} +{% if is_incremental() and execute %} {# First, the current status for each hashkey is queried @@ -52,7 +52,7 @@ WITH ), - {%- for source_model in source_models -%} + {% for source_model in source_models -%} {# For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. @@ -77,7 +77,7 @@ WITH ), - {%- endfor -%} + {%- endfor %} {# All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. @@ -135,7 +135,7 @@ WITH ), - {%- endif -%} + {%- endif %} {# All hashkeys that have a status change should be inserted. 
@@ -181,9 +181,9 @@ WITH ), - {%- endfor -%} + {%- endfor %} - {%- if source_models | length > 1 -%} + {% if source_models | length > 1 -%} hashkeys_union AS ( @@ -208,7 +208,7 @@ WITH ), - {%- endif -%} + {%- endif %} records_to_insert AS ( From d668af3ba0c76d2780a917e51af47c77d01d05d0 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Wed, 11 Sep 2024 15:05:07 +0200 Subject: [PATCH 04/22] Multi source fix --- macros/tables/snowflake/eff_sat_v0.sql | 31 +++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/macros/tables/snowflake/eff_sat_v0.sql b/macros/tables/snowflake/eff_sat_v0.sql index 2cf174cc..2ea2f94f 100644 --- a/macros/tables/snowflake/eff_sat_v0.sql +++ b/macros/tables/snowflake/eff_sat_v0.sql @@ -131,9 +131,21 @@ WITH {%- endfor -%} - {%- set ns.new_hashkeys_cte = 'new_hashkeys_union' -%} + ), + + new_hashkeys_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_union + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 - ), + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} + + ), {%- endif %} @@ -204,7 +216,20 @@ WITH {%- endfor -%} - {%- set ns.last_cte = 'hashkeys_union' -%} + + ), + + hashkey_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkeys_union + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'hashkey_union_dedupe' -%} ), From 9b1178e96a2e5c1685eb1b15d2237a8155f30228 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:01:28 +0200 Subject: [PATCH 05/22] copy eff_sat_v0 macros to all adapter folders --- macros/tables/bigquery/eff_sat_v0.sql | 256 ++++++++++++++++++++++++++ macros/tables/exasol/eff_sat_v0.sql | 256 ++++++++++++++++++++++++++ 
macros/tables/postgres/eff_sat_v0.sql | 256 ++++++++++++++++++++++++++ macros/tables/redshift/eff_sat_v0.sql | 256 ++++++++++++++++++++++++++ macros/tables/synapse/eff_sat_v0.sql | 256 ++++++++++++++++++++++++++ 5 files changed, 1280 insertions(+) create mode 100644 macros/tables/bigquery/eff_sat_v0.sql create mode 100644 macros/tables/exasol/eff_sat_v0.sql create mode 100644 macros/tables/postgres/eff_sat_v0.sql create mode 100644 macros/tables/redshift/eff_sat_v0.sql create mode 100644 macros/tables/synapse/eff_sat_v0.sql diff --git a/macros/tables/bigquery/eff_sat_v0.sql b/macros/tables/bigquery/eff_sat_v0.sql new file mode 100644 index 00000000..7f4c07cc --- /dev/null +++ b/macros/tables/bigquery/eff_sat_v0.sql @@ -0,0 +1,256 @@ +{%- macro default__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} + +{%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set timestamp_format = datavault4dbt.timestamp_format() -%} + +{%- set ns = namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} + +{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} +{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} +{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} + {%- set source_models = {source_models: {}} -%} +{%- endif -%} + +{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} +{%- set source_models = source_model_values['source_model_list'] -%} +{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} +{%- set ns.source_models_rsrc_dict = 
source_model_values['source_models_rsrc_dict'] -%} +{{ log('source_models: '~source_models, false) }} + +{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} + +{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} +{%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} +{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} +{%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} + +{{ log('columns to select: '~final_columns_to_select, false) }} + +{{ datavault4dbt.prepend_generated_by() }} + +WITH +{# + For incremental runs, three different cases can occur: + hashkey disappeared -> deleted_flag = 1 + hashkey reappeared -> deleted_flag = 0 + new hashkey appeared -> deleted_flag = 0 +#} +{% if is_incremental() and execute %} + + {# + First, the current status for each hashkey is queried + #} + current_status AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }} + FROM {{ this }} + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 + + ), + + {% for source_model in source_models -%} + + {# + For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. 
+ #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + new_hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} + AND cs.{{ deleted_flag_alias }} = 0 + WHERE cs.{{ tracked_hashkey }} IS NULL + + {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {# + All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. + If they can't be found anywhere, they are marked as deleted_flag = 1. + #} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + cs.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 1 as {{ deleted_flag_alias }} + FROM current_status cs + WHERE + {% for source_model in source_models %} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + {{ 'AND' if not loop.first }} + NOT EXISTS ( + SELECT + 1 + FROM {{ ref(source_model.name) }} src + WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} + ) + {% endfor %} + AND cs.{{ deleted_flag_alias }} = 0 + + ), + + + {%- if source_models | length > 1 -%} + + {# + If more than one source_model is defined, the new hashkeys of all source_models are unioned. 
+ #} + new_hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + ), + + new_hashkeys_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_union + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} + + ), + + {%- endif %} + + {# + All hashkeys that have a status change should be inserted. + That includes new, reappeared, and disappeared hashkeys. + #} + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.new_hashkeys_cte }} + + UNION + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM disappeared_hashkeys + + ) + +{%- else %} + + {# + In initial runs, every available hashkey is marked as deleted_flag = 0. + #} + {% for source_model in source_models %} + + {# + When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. 
+ #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + + {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {% if source_models | length > 1 -%} + + hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + + ), + + hashkey_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkeys_union + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'hashkey_union_dedupe' -%} + + ), + + {%- endif %} + + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.last_cte }} + + ) + + +{% endif %} + +SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} +FROM records_to_insert + +{%- endmacro -%} + + + diff --git a/macros/tables/exasol/eff_sat_v0.sql b/macros/tables/exasol/eff_sat_v0.sql new file mode 100644 index 00000000..4eff2dfe --- /dev/null +++ b/macros/tables/exasol/eff_sat_v0.sql @@ -0,0 +1,256 @@ +{%- macro exasol__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} + +{%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set timestamp_format = datavault4dbt.timestamp_format() -%} + +{%- set ns = namespace(new_hashkeys_cte="", 
disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} + +{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} +{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} +{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} + {%- set source_models = {source_models: {}} -%} +{%- endif -%} + +{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} +{%- set source_models = source_model_values['source_model_list'] -%} +{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} +{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} +{{ log('source_models: '~source_models, false) }} + +{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} + +{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} +{%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} +{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} +{%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} + +{{ log('columns to select: '~final_columns_to_select, false) }} + +{{ datavault4dbt.prepend_generated_by() }} + +WITH +{# + For incremental runs, three different cases can occur: + hashkey disappeared -> deleted_flag = 1 + hashkey reappeared -> deleted_flag = 0 + new hashkey appeared -> deleted_flag = 0 +#} +{% if is_incremental() and execute %} + + {# + First, the current status for each hashkey is queried + #} + 
current_status AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }} + FROM {{ this }} + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 + + ), + + {% for source_model in source_models -%} + + {# + For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. + #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + new_hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} + AND cs.{{ deleted_flag_alias }} = 0 + WHERE cs.{{ tracked_hashkey }} IS NULL + + {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {# + All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. + If they can't be found anywhere, they are marked as deleted_flag = 1. 
+ #} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + cs.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 1 as {{ deleted_flag_alias }} + FROM current_status cs + WHERE + {% for source_model in source_models %} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + {{ 'AND' if not loop.first }} + NOT EXISTS ( + SELECT + 1 + FROM {{ ref(source_model.name) }} src + WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} + ) + {% endfor %} + AND cs.{{ deleted_flag_alias }} = 0 + + ), + + + {%- if source_models | length > 1 -%} + + {# + If more than one source_model is defined, the new hashkeys of all source_models are unioned. + #} + new_hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + ), + + new_hashkeys_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_union + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} + + ), + + {%- endif %} + + {# + All hashkeys that have a status change should be inserted. + That includes new, reappeared, and disappeared hashkeys. + #} + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.new_hashkeys_cte }} + + UNION + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM disappeared_hashkeys + + ) + +{%- else %} + + {# + In initial runs, every available hashkey is marked as deleted_flag = 0. 
+ #} + {% for source_model in source_models %} + + {# + When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + + {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {% if source_models | length > 1 -%} + + hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + + ), + + hashkey_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkeys_union + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'hashkey_union_dedupe' -%} + + ), + + {%- endif %} + + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.last_cte }} + + ) + + +{% endif %} + +SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} +FROM records_to_insert + +{%- endmacro -%} + + + diff --git a/macros/tables/postgres/eff_sat_v0.sql b/macros/tables/postgres/eff_sat_v0.sql new file mode 100644 index 00000000..ed908c82 --- /dev/null +++ b/macros/tables/postgres/eff_sat_v0.sql @@ -0,0 +1,256 @@ +{%- macro postgres__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} + +{%- set 
end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set timestamp_format = datavault4dbt.timestamp_format() -%} + +{%- set ns = namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} + +{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} +{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} +{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} + {%- set source_models = {source_models: {}} -%} +{%- endif -%} + +{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} +{%- set source_models = source_model_values['source_model_list'] -%} +{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} +{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} +{{ log('source_models: '~source_models, false) }} + +{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} + +{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} +{%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} +{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} +{%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} + +{{ log('columns to select: '~final_columns_to_select, false) }} + +{{ datavault4dbt.prepend_generated_by() }} + +WITH +{# + For incremental runs, three different cases can occur: + hashkey disappeared -> deleted_flag = 1 + hashkey reappeared -> 
deleted_flag = 0 + new hashkey appeared -> deleted_flag = 0 +#} +{% if is_incremental() and execute %} + + {# + First, the current status for each hashkey is queried + #} + current_status AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }} + FROM {{ this }} + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 + + ), + + {% for source_model in source_models -%} + + {# + For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. + #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + new_hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ deleted_flag_alias }} = 0 + WHERE cs.{{ tracked_hashkey }} IS NULL + + {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {# + All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. + If they can't be found anywhere, they are marked as deleted_flag = 1. 
+ #} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + cs.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 1 as {{ deleted_flag_alias }} + FROM current_status cs + WHERE + {% for source_model in source_models %} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + {{ 'AND' if not loop.first }} + NOT EXISTS ( + SELECT + 1 + FROM {{ ref(source_model.name) }} src + WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} + ) + {% endfor %} + AND cs.{{ deleted_flag_alias }} = 0 + + ), + + + {%- if source_models | length > 1 -%} + + {# + If more then one source_model is defined, the new hashkeys of all source_models are unioned. + #} + new_hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + ), + + new_hashkeys_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_union_dedupe_prep + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} + + ), + + {%- endif %} + + {# + All hashkeys that have a status change should be inserted. + That includes new, reappeared, and disappeared hashkeys. + #} + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.new_hashkeys_cte }} + + UNION + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM disappeared_hashkeys + + ) + +{%- else %} + + {# + In initial runs, every available hashkey is marked as deleted_flag = 0. 
+ #} + {% for source_model in source_models %} + + {# + When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + + {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {% if source_models | length > 1 -%} + + hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + + ), + + hashkey_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkey_union_dedupe_prep + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'hashkey_union_dedupe' -%} + + ), + + {%- endif %} + + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.last_cte }} + + ) + + +{% endif %} + +SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} +FROM records_to_insert + +{%- endmacro -%} + + + diff --git a/macros/tables/redshift/eff_sat_v0.sql b/macros/tables/redshift/eff_sat_v0.sql new file mode 100644 index 00000000..dffdbcf3 --- /dev/null +++ b/macros/tables/redshift/eff_sat_v0.sql @@ -0,0 +1,256 @@ +{%- macro redshift__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} + +{%- set 
end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set timestamp_format = datavault4dbt.timestamp_format() -%} + +{%- set ns = namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} + +{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} +{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} +{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} + {%- set source_models = {source_models: {}} -%} +{%- endif -%} + +{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} +{%- set source_models = source_model_values['source_model_list'] -%} +{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} +{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} +{{ log('source_models: '~source_models, false) }} + +{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} + +{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} +{%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} +{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} +{%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} + +{{ log('columns to select: '~final_columns_to_select, false) }} + +{{ datavault4dbt.prepend_generated_by() }} + +WITH +{# + For incremental runs, three different cases can occur: + hashkey disappeared -> deleted_flag = 1 + hashkey reappeared -> 
deleted_flag = 0 + new hashkey appeared -> deleted_flag = 0 +#} +{% if is_incremental() and execute %} + + {# + First, the current status for each hashkey is queried + #} + current_status AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }} + FROM {{ this }} + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 + + ), + + {% for source_model in source_models -%} + + {# + For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. + #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + new_hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ deleted_flag_alias }} = 0 + WHERE cs.{{ tracked_hashkey }} IS NULL + + {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {# + All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. + If they can't be found anywhere, they are marked as deleted_flag = 1. 
+ #} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + cs.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 1 as {{ deleted_flag_alias }} + FROM current_status cs + WHERE + {% for source_model in source_models %} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + {{ 'AND' if not loop.first }} + NOT EXISTS ( + SELECT + 1 + FROM {{ ref(source_model.name) }} src + WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} + ) + {% endfor %} + AND cs.{{ deleted_flag_alias }} = 0 + + ), + + + {%- if source_models | length > 1 -%} + + {# + If more then one source_model is defined, the new hashkeys of all source_models are unioned. + #} + new_hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + ), + + new_hashkeys_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_union_dedupe_prep + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} + + ), + + {%- endif %} + + {# + All hashkeys that have a status change should be inserted. + That includes new, reappeared, and disappeared hashkeys. + #} + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.new_hashkeys_cte }} + + UNION + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM disappeared_hashkeys + + ) + +{%- else %} + + {# + In initial runs, every available hashkey is marked as deleted_flag = 0. 
+ #} + {% for source_model in source_models %} + + {# + When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + + {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {% if source_models | length > 1 -%} + + hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + + ), + + hashkey_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkey_union_dedupe_prep + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'hashkey_union_dedupe' -%} + + ), + + {%- endif %} + + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.last_cte }} + + ) + + +{% endif %} + +SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} +FROM records_to_insert + +{%- endmacro -%} + + + diff --git a/macros/tables/synapse/eff_sat_v0.sql b/macros/tables/synapse/eff_sat_v0.sql new file mode 100644 index 00000000..6e0fbf83 --- /dev/null +++ b/macros/tables/synapse/eff_sat_v0.sql @@ -0,0 +1,256 @@ +{%- macro synapse__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} + +{%- set end_of_all_times = 
datavault4dbt.end_of_all_times() -%} +{%- set timestamp_format = datavault4dbt.timestamp_format() -%} + +{%- set ns = namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} + +{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} +{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} +{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} + {%- set source_models = {source_models: {}} -%} +{%- endif -%} + +{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} +{%- set source_models = source_model_values['source_model_list'] -%} +{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} +{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} +{{ log('source_models: '~source_models, false) }} + +{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} + +{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} +{%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} +{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} +{%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} + +{{ log('columns to select: '~final_columns_to_select, false) }} + +{{ datavault4dbt.prepend_generated_by() }} + +WITH +{# + For incremental runs, three different cases can occur: + hashkey disappeared -> deleted_flag = 1 + hashkey reappeared -> deleted_flag = 0 + new 
hashkey appeared -> deleted_flag = 0 +#} +{% if is_incremental() and execute %} + + {# + First, the current status for each hashkey is queried + #} + current_status AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }} + FROM {{ this }} + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 + + ), + + {% for source_model in source_models -%} + + {# + For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. + #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + new_hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ deleted_flag_alias }} = 0 + WHERE cs.{{ tracked_hashkey }} IS NULL + + {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {# + All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. + If they can't be found anywhere, they are marked as deleted_flag = 1. 
+ #} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + cs.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 1 as {{ deleted_flag_alias }} + FROM current_status cs + WHERE + {% for source_model in source_models %} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + {{ 'AND' if not loop.first }} + NOT EXISTS ( + SELECT + 1 + FROM {{ ref(source_model.name) }} src + WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} + ) + {% endfor %} + AND cs.{{ deleted_flag_alias }} = 0 + + ), + + + {%- if source_models | length > 1 -%} + + {# + If more then one source_model is defined, the new hashkeys of all source_models are unioned. + #} + new_hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + ), + + new_hashkeys_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM new_hashkeys_union_dedupe_prep + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} + + ), + + {%- endif %} + + {# + All hashkeys that have a status change should be inserted. + That includes new, reappeared, and disappeared hashkeys. + #} + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.new_hashkeys_cte }} + + UNION + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM disappeared_hashkeys + + ) + +{%- else %} + + {# + In initial runs, every available hashkey is marked as deleted_flag = 0. 
+ #} + {% for source_model in source_models %} + + {# + When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + #} + {%- set source_number = source_model.id | string -%} + {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} + + hashkeys_{{ source_number }} AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, + src.{{ src_rsrc }}, + {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + 0 as {{ deleted_flag_alias }} + FROM {{ ref(source_model.name) }} src + + {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} + + ), + + {%- endfor %} + + {% if source_models | length > 1 -%} + + hashkeys_union AS ( + + {%- for source_model in source_models -%} + + {%- set source_number = source_model.id | string -%} + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkeys_{{ source_number }} + + {%- if not loop.last %} + UNION + {% endif -%} + + {%- endfor -%} + + + ), + + hashkey_union_dedupe AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }} + FROM hashkey_union_dedupe_prep + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'hashkey_union_dedupe' -%} + + ), + + {%- endif %} + + records_to_insert AS ( + + SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} + FROM {{ ns.last_cte }} + + ) + + +{% endif %} + +SELECT + {{ datavault4dbt.print_list(final_columns_to_select) }} +FROM records_to_insert + +{%- endmacro -%} + + + From 543075c60ffd5bc7f5c9fca1e00d671ca78e51ec Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Sep 2024 10:15:38 +0200 Subject: [PATCH 06/22] implement eff_sat_v0 for all adapters --- macros/tables/bigquery/eff_sat_v0.sql | 13 +++---- macros/tables/exasol/eff_sat_v0.sql | 7 +--- 
macros/tables/postgres/eff_sat_v0.sql | 51 +++++++++++++++++++++----- macros/tables/redshift/eff_sat_v0.sql | 51 +++++++++++++++++++++----- macros/tables/snowflake/eff_sat_v0.sql | 7 +--- macros/tables/synapse/eff_sat_v0.sql | 50 ++++++++++++++++++++----- 6 files changed, 134 insertions(+), 45 deletions(-) diff --git a/macros/tables/bigquery/eff_sat_v0.sql b/macros/tables/bigquery/eff_sat_v0.sql index 7f4c07cc..0426156d 100644 --- a/macros/tables/bigquery/eff_sat_v0.sql +++ b/macros/tables/bigquery/eff_sat_v0.sql @@ -126,7 +126,7 @@ WITH FROM new_hashkeys_{{ source_number }} {%- if not loop.last %} - UNION + UNION ALL {% endif -%} {%- endfor -%} @@ -140,7 +140,7 @@ WITH {{ src_rsrc }}, {{ src_ldts }}, {{ deleted_flag_alias }} - FROM new_hashkeys_union_dedupe_prep + FROM new_hashkeys_union QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} @@ -159,7 +159,7 @@ WITH {{ datavault4dbt.print_list(final_columns_to_select) }} FROM {{ ns.new_hashkeys_cte }} - UNION + UNION ALL SELECT {{ datavault4dbt.print_list(final_columns_to_select) }} @@ -211,7 +211,7 @@ WITH FROM hashkeys_{{ source_number }} {%- if not loop.last %} - UNION + UNION ALL {% endif -%} {%- endfor -%} @@ -226,7 +226,7 @@ WITH {{ src_rsrc }}, {{ src_ldts }}, {{ deleted_flag_alias }} - FROM hashkey_union_dedupe_prep + FROM hashkeys_union QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 {%- set ns.last_cte = 'hashkey_union_dedupe' -%} @@ -251,6 +251,3 @@ SELECT FROM records_to_insert {%- endmacro -%} - - - diff --git a/macros/tables/exasol/eff_sat_v0.sql b/macros/tables/exasol/eff_sat_v0.sql index 4eff2dfe..8f119f17 100644 --- a/macros/tables/exasol/eff_sat_v0.sql +++ b/macros/tables/exasol/eff_sat_v0.sql @@ -140,7 +140,7 @@ WITH {{ src_rsrc }}, {{ src_ldts }}, {{ deleted_flag_alias }} - FROM new_hashkeys_union_dedupe_prep + FROM new_hashkeys_union QUALIFY ROW_NUMBER() OVER 
(PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} @@ -226,7 +226,7 @@ WITH {{ src_rsrc }}, {{ src_ldts }}, {{ deleted_flag_alias }} - FROM hashkey_union_dedupe_prep + FROM hashkeys_union QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 {%- set ns.last_cte = 'hashkey_union_dedupe' -%} @@ -251,6 +251,3 @@ SELECT FROM records_to_insert {%- endmacro -%} - - - diff --git a/macros/tables/postgres/eff_sat_v0.sql b/macros/tables/postgres/eff_sat_v0.sql index ed908c82..5883eb9a 100644 --- a/macros/tables/postgres/eff_sat_v0.sql +++ b/macros/tables/postgres/eff_sat_v0.sql @@ -41,20 +41,32 @@ WITH {# First, the current status for each hashkey is queried #} + + current_status_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + FROM {{ this }} + + ), + current_status AS ( SELECT {{ tracked_hashkey }}, {{ deleted_flag_alias }}, {{ src_rsrc }} - FROM {{ this }} - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 + FROM current_status_prep + WHERE rn = 1 ), {% for source_model in source_models -%} - {# + {# For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. 
#} {%- set source_number = source_model.id | string -%} @@ -133,6 +145,18 @@ WITH ), + new_hashkeys_union_dedupe_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + FROM new_hashkeys_union + + ), + new_hashkeys_union_dedupe AS ( SELECT @@ -141,11 +165,11 @@ WITH {{ src_ldts }}, {{ deleted_flag_alias }} FROM new_hashkeys_union_dedupe_prep - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + WHERE rn = 1 {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} - ), + ), {%- endif %} @@ -219,6 +243,18 @@ WITH ), + hashkey_union_dedupe_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + FROM hashkeys_union + + ), + hashkey_union_dedupe AS ( SELECT @@ -227,7 +263,7 @@ WITH {{ src_ldts }}, {{ deleted_flag_alias }} FROM hashkey_union_dedupe_prep - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + WHERE rn = 1 {%- set ns.last_cte = 'hashkey_union_dedupe' -%} @@ -251,6 +287,3 @@ SELECT FROM records_to_insert {%- endmacro -%} - - - diff --git a/macros/tables/redshift/eff_sat_v0.sql b/macros/tables/redshift/eff_sat_v0.sql index dffdbcf3..b89f99fe 100644 --- a/macros/tables/redshift/eff_sat_v0.sql +++ b/macros/tables/redshift/eff_sat_v0.sql @@ -41,20 +41,32 @@ WITH {# First, the current status for each hashkey is queried #} + + current_status_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + FROM {{ this }} + + ), + current_status AS ( SELECT {{ tracked_hashkey }}, {{ deleted_flag_alias }}, {{ src_rsrc }} - FROM {{ this }} - QUALIFY ROW_NUMBER() OVER (PARTITION BY 
{{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 + FROM current_status_prep + WHERE rn = 1 ), {% for source_model in source_models -%} - {# + {# For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. #} {%- set source_number = source_model.id | string -%} @@ -133,6 +145,18 @@ WITH ), + new_hashkeys_union_dedupe_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + FROM new_hashkeys_union + + ), + new_hashkeys_union_dedupe AS ( SELECT @@ -141,11 +165,11 @@ WITH {{ src_ldts }}, {{ deleted_flag_alias }} FROM new_hashkeys_union_dedupe_prep - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + WHERE rn = 1 {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} - ), + ), {%- endif %} @@ -219,6 +243,18 @@ WITH ), + hashkey_union_dedupe_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + FROM hashkeys_union + + ), + hashkey_union_dedupe AS ( SELECT @@ -227,7 +263,7 @@ WITH {{ src_ldts }}, {{ deleted_flag_alias }} FROM hashkey_union_dedupe_prep - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + WHERE rn = 1 {%- set ns.last_cte = 'hashkey_union_dedupe' -%} @@ -251,6 +287,3 @@ SELECT FROM records_to_insert {%- endmacro -%} - - - diff --git a/macros/tables/snowflake/eff_sat_v0.sql b/macros/tables/snowflake/eff_sat_v0.sql index 2ea2f94f..5de8a17a 100644 --- a/macros/tables/snowflake/eff_sat_v0.sql +++ b/macros/tables/snowflake/eff_sat_v0.sql @@ -140,7 +140,7 @@ WITH {{ src_rsrc }}, {{ src_ldts }}, {{ deleted_flag_alias }} - FROM new_hashkeys_union_dedupe_prep + FROM new_hashkeys_union QUALIFY ROW_NUMBER() 
OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} @@ -226,7 +226,7 @@ WITH {{ src_rsrc }}, {{ src_ldts }}, {{ deleted_flag_alias }} - FROM hashkey_union_dedupe_prep + FROM hashkeys_union QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 {%- set ns.last_cte = 'hashkey_union_dedupe' -%} @@ -251,6 +251,3 @@ SELECT FROM records_to_insert {%- endmacro -%} - - - diff --git a/macros/tables/synapse/eff_sat_v0.sql b/macros/tables/synapse/eff_sat_v0.sql index 6e0fbf83..7790507d 100644 --- a/macros/tables/synapse/eff_sat_v0.sql +++ b/macros/tables/synapse/eff_sat_v0.sql @@ -41,20 +41,31 @@ WITH {# First, the current status for each hashkey is queried #} + current_status_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ deleted_flag_alias }}, + {{ src_rsrc }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + FROM {{ this }} + + ), + current_status AS ( SELECT {{ tracked_hashkey }}, {{ deleted_flag_alias }}, {{ src_rsrc }} - FROM {{ this }} - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 + FROM current_status_prep + WHERE rn = 1 ), {% for source_model in source_models -%} - {# + {# For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. 
#} {%- set source_number = source_model.id | string -%} @@ -133,6 +144,18 @@ WITH ), + new_hashkeys_union_dedupe_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + FROM new_hashkeys_union + + ), + new_hashkeys_union_dedupe AS ( SELECT @@ -141,11 +164,11 @@ WITH {{ src_ldts }}, {{ deleted_flag_alias }} FROM new_hashkeys_union_dedupe_prep - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + WHERE rn = 1 {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} - ), + ), {%- endif %} @@ -219,6 +242,18 @@ WITH ), + hashkey_union_dedupe_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_rsrc }}, + {{ src_ldts }}, + {{ deleted_flag_alias }}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + FROM hashkeys_union + + ), + hashkey_union_dedupe AS ( SELECT @@ -227,7 +262,7 @@ WITH {{ src_ldts }}, {{ deleted_flag_alias }} FROM hashkey_union_dedupe_prep - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + WHERE rn = 1 {%- set ns.last_cte = 'hashkey_union_dedupe' -%} @@ -251,6 +286,3 @@ SELECT FROM records_to_insert {%- endmacro -%} - - - From 4fa65a34f39b53930895da5040f1f269db9e116c Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Fri, 13 Sep 2024 10:57:17 +0200 Subject: [PATCH 07/22] change ldts of first appearing HKs from current_timestamp to the source ldts --- macros/tables/bigquery/eff_sat_v0.sql | 4 ++-- macros/tables/exasol/eff_sat_v0.sql | 4 ++-- macros/tables/postgres/eff_sat_v0.sql | 4 ++-- macros/tables/redshift/eff_sat_v0.sql | 4 ++-- macros/tables/snowflake/eff_sat_v0.sql | 4 ++-- macros/tables/synapse/eff_sat_v0.sql | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/macros/tables/bigquery/eff_sat_v0.sql 
b/macros/tables/bigquery/eff_sat_v0.sql index 0426156d..30833ea2 100644 --- a/macros/tables/bigquery/eff_sat_v0.sql +++ b/macros/tables/bigquery/eff_sat_v0.sql @@ -65,7 +65,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src LEFT JOIN current_status cs @@ -185,7 +185,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src diff --git a/macros/tables/exasol/eff_sat_v0.sql b/macros/tables/exasol/eff_sat_v0.sql index 8f119f17..dddf6369 100644 --- a/macros/tables/exasol/eff_sat_v0.sql +++ b/macros/tables/exasol/eff_sat_v0.sql @@ -65,7 +65,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src LEFT JOIN current_status cs @@ -185,7 +185,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src diff --git a/macros/tables/postgres/eff_sat_v0.sql b/macros/tables/postgres/eff_sat_v0.sql index 5883eb9a..004dd23b 100644 --- a/macros/tables/postgres/eff_sat_v0.sql +++ b/macros/tables/postgres/eff_sat_v0.sql @@ -77,7 +77,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src LEFT JOIN current_status cs @@ 
-209,7 +209,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src diff --git a/macros/tables/redshift/eff_sat_v0.sql b/macros/tables/redshift/eff_sat_v0.sql index b89f99fe..908d3fdf 100644 --- a/macros/tables/redshift/eff_sat_v0.sql +++ b/macros/tables/redshift/eff_sat_v0.sql @@ -77,7 +77,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src LEFT JOIN current_status cs @@ -209,7 +209,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src diff --git a/macros/tables/snowflake/eff_sat_v0.sql b/macros/tables/snowflake/eff_sat_v0.sql index 5de8a17a..dc2b4c52 100644 --- a/macros/tables/snowflake/eff_sat_v0.sql +++ b/macros/tables/snowflake/eff_sat_v0.sql @@ -65,7 +65,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src LEFT JOIN current_status cs @@ -185,7 +185,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src diff --git a/macros/tables/synapse/eff_sat_v0.sql b/macros/tables/synapse/eff_sat_v0.sql index 7790507d..45f413a4 100644 --- a/macros/tables/synapse/eff_sat_v0.sql +++ 
b/macros/tables/synapse/eff_sat_v0.sql @@ -76,7 +76,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src LEFT JOIN current_status cs @@ -208,7 +208,7 @@ WITH SELECT DISTINCT src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, src.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, + src.{{ src_ldts }}, 0 as {{ deleted_flag_alias }} FROM {{ ref(source_model.name) }} src From af155d011431c7041b93f58373cf20c1b9343372 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Fri, 13 Sep 2024 15:19:54 +0200 Subject: [PATCH 08/22] fix records_to_inserts for multi models the records_to_insert cte did select from the wrong new hashkey cte --- macros/tables/bigquery/eff_sat_v0.sql | 4 ++-- macros/tables/exasol/eff_sat_v0.sql | 4 ++-- macros/tables/postgres/eff_sat_v0.sql | 2 +- macros/tables/redshift/eff_sat_v0.sql | 2 +- macros/tables/snowflake/eff_sat_v0.sql | 4 ++-- macros/tables/synapse/eff_sat_v0.sql | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/macros/tables/bigquery/eff_sat_v0.sql b/macros/tables/bigquery/eff_sat_v0.sql index 30833ea2..2f4bb035 100644 --- a/macros/tables/bigquery/eff_sat_v0.sql +++ b/macros/tables/bigquery/eff_sat_v0.sql @@ -54,7 +54,7 @@ WITH {% for source_model in source_models -%} - {# + {# For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. 
#} {%- set source_number = source_model.id | string -%} @@ -143,7 +143,7 @@ WITH FROM new_hashkeys_union QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 - {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} ), diff --git a/macros/tables/exasol/eff_sat_v0.sql b/macros/tables/exasol/eff_sat_v0.sql index dddf6369..cb1915ba 100644 --- a/macros/tables/exasol/eff_sat_v0.sql +++ b/macros/tables/exasol/eff_sat_v0.sql @@ -54,7 +54,7 @@ WITH {% for source_model in source_models -%} - {# + {# For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. #} {%- set source_number = source_model.id | string -%} @@ -143,7 +143,7 @@ WITH FROM new_hashkeys_union QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 - {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} ), diff --git a/macros/tables/postgres/eff_sat_v0.sql b/macros/tables/postgres/eff_sat_v0.sql index 004dd23b..a2f8b6ee 100644 --- a/macros/tables/postgres/eff_sat_v0.sql +++ b/macros/tables/postgres/eff_sat_v0.sql @@ -167,7 +167,7 @@ WITH FROM new_hashkeys_union_dedupe_prep WHERE rn = 1 - {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} ), diff --git a/macros/tables/redshift/eff_sat_v0.sql b/macros/tables/redshift/eff_sat_v0.sql index 908d3fdf..bd8d122f 100644 --- a/macros/tables/redshift/eff_sat_v0.sql +++ b/macros/tables/redshift/eff_sat_v0.sql @@ -167,7 +167,7 @@ WITH FROM new_hashkeys_union_dedupe_prep WHERE rn = 1 - {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} ), diff --git a/macros/tables/snowflake/eff_sat_v0.sql b/macros/tables/snowflake/eff_sat_v0.sql index dc2b4c52..4965ed0d 100644 
--- a/macros/tables/snowflake/eff_sat_v0.sql +++ b/macros/tables/snowflake/eff_sat_v0.sql @@ -54,7 +54,7 @@ WITH {% for source_model in source_models -%} - {# + {# For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. #} {%- set source_number = source_model.id | string -%} @@ -143,7 +143,7 @@ WITH FROM new_hashkeys_union QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 - {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} ), diff --git a/macros/tables/synapse/eff_sat_v0.sql b/macros/tables/synapse/eff_sat_v0.sql index 45f413a4..4fd54205 100644 --- a/macros/tables/synapse/eff_sat_v0.sql +++ b/macros/tables/synapse/eff_sat_v0.sql @@ -166,7 +166,7 @@ WITH FROM new_hashkeys_union_dedupe_prep WHERE rn = 1 - {%- set ns.last_cte = 'new_hashkeys_union_dedupe' -%} + {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} ), From f1e623b77da2617e6ed62b133d22381f5d70d86d Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Fri, 20 Sep 2024 08:52:50 +0200 Subject: [PATCH 09/22] incorporate improvements from snowflake for all adapters, replace first qualify with prep cte --- macros/tables/bigquery/eff_sat_v0.sql | 403 ++++++++++++---------- macros/tables/eff_sat_v0.sql | 10 +- macros/tables/exasol/eff_sat_v0.sql | 403 ++++++++++++---------- macros/tables/postgres/eff_sat_v0.sql | 443 +++++++++++++------------ macros/tables/redshift/eff_sat_v0.sql | 440 ++++++++++++------------ macros/tables/snowflake/eff_sat_v0.sql | 403 ++++++++++++---------- macros/tables/synapse/eff_sat_v0.sql | 439 ++++++++++++------------ 7 files changed, 1349 insertions(+), 1192 deletions(-) diff --git a/macros/tables/bigquery/eff_sat_v0.sql b/macros/tables/bigquery/eff_sat_v0.sql index 2f4bb035..264fbb2a 100644 --- a/macros/tables/bigquery/eff_sat_v0.sql +++ 
b/macros/tables/bigquery/eff_sat_v0.sql @@ -1,27 +1,14 @@ -{%- macro default__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} +{%- macro default__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} {%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} {%- set timestamp_format = datavault4dbt.timestamp_format() -%} -{%- set ns = namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} +{%- set ns = namespace(last_cte= "") -%} -{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} -{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} -{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} - {%- set source_models = {source_models: {}} -%} -{%- endif -%} +{%- set source_relation = ref(source_model) -%} -{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} -{%- set source_models = source_model_values['source_model_list'] -%} -{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} -{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} -{{ log('source_models: '~source_models, false) }} - -{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} - -{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} {%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} -{%- set deleted_flag_alias = 
datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set is_active_alias = datavault4dbt.escape_column_names(is_active_alias) -%} {%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} {%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} @@ -30,224 +17,276 @@ {{ datavault4dbt.prepend_generated_by() }} WITH + {# - For incremental runs, three different cases can occur: - hashkey disappeared -> deleted_flag = 1 - hashkey reappeared -> deleted_flag = 0 - new hashkey appeared -> deleted_flag = 0 + In all cases, the source model is selected, and optionally a HWM is applied. #} -{% if is_incremental() and execute %} - - {# - First, the current status for each hashkey is queried - #} - current_status AS ( - +source_data AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }} + FROM {{ source_relation }} src + WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + {%- if is_incremental() and not disable_hwm %} + AND src.{{ src_ldts }} > ( SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }} + MAX({{ src_ldts }}) FROM {{ this }} - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 - - ), - - {% for source_model in source_models -%} - - {# - For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. 
- #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - new_hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - LEFT JOIN current_status cs - ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} - AND cs.{{ deleted_flag_alias }} = 0 - WHERE cs.{{ tracked_hashkey }} IS NULL - - {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} - - {# - All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. - If they can't be found anywhere, they are marked as deleted_flag = 1. - #} - disappeared_hashkeys AS ( - - SELECT DISTINCT - cs.{{ tracked_hashkey }}, - cs.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, - 1 as {{ deleted_flag_alias }} - FROM current_status cs - WHERE - {% for source_model in source_models %} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - {{ 'AND' if not loop.first }} - NOT EXISTS ( - SELECT - 1 - FROM {{ ref(source_model.name) }} src - WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} - ) - {% endfor %} - AND cs.{{ deleted_flag_alias }} = 0 + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + ) + {%- endif %} +), - ), +{# + In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. 
+#} +{%- if is_incremental() %} +current_status AS ( + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias }} + FROM {{ this }} + QUALIFY + ROW_NUMBER() OVER(PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 - {%- if source_models | length > 1 -%} +), +{% endif %} - {# - If more then one source_model is defined, the new hashkeys of all source_models are unioned. - #} - new_hashkeys_union AS ( +{# + This block is for multi-batch processing. +#} +{% if not source_is_single_batch %} - {%- for source_model in source_models -%} + {# + List of all Hashkeys with their date of first appearance in the source model. + #} + hashkeys AS ( - {%- set source_number = source_model.id | string -%} + SELECT + {{ tracked_hashkey }}, + MIN({{ src_ldts }}) as first_appearance + FROM source_data + GROUP BY {{ tracked_hashkey }} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_{{ source_number }} + ), - {%- if not loop.last %} - UNION ALL - {% endif -%} + {# + Distinct list of load dates in the multi-batch source. + #} + load_dates AS ( - {%- endfor -%} + SELECT Distinct + {{ src_ldts }} + FROM source_data + + ), - ), + {# + All combinations of hashkeys and loaddates, for loaddates after the first appearance of a hashkey. + #} + history AS ( - new_hashkeys_union_dedupe AS ( + SELECT + hk.{{ tracked_hashkey }}, + ld.{{ src_ldts }} + FROM hashkeys hk + CROSS JOIN load_dates ld + WHERE ld.{{ src_ldts }} >= hk.first_appearance - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_union - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + ), - {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} + {# + All theoretical combinations are checked against the actual occurences of hashkeys in each batch / loaddate. 
+ If a Hashkey is part of a load/batch, is_active_alias is set to 1, because the hashkey was active in that load/batch. + If a Hashkey is not part of a load/batch, is_active_alias is set to 0, because the hashkey was not active in that load/batch. + #} + is_active AS ( - ), + SELECT + h.{{ tracked_hashkey }}, + h.{{ src_ldts }}, + CASE + WHEN src.{{ tracked_hashkey }} IS NULL THEN 0 + ELSE 1 + END as {{ is_active_alias }} + FROM history h + LEFT JOIN source_data src + ON src.{{ tracked_hashkey }} = h.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = h.{{ src_ldts }} - {%- endif %} + ), {# - All hashkeys that have a status change should be inserted. - That includes new, reappeared, and disappeared hashkeys. + The rows are deduplicated on the is_active_alias, to only include status changes. + Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. #} - records_to_insert AS ( + deduplicated_incoming AS ( SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.new_hashkeys_cte }} + is_active.{{ tracked_hashkey }}, + is_active.{{ src_ldts }}, + is_active.{{ is_active_alias }} - UNION ALL + {% if is_incremental() -%} + , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn + {%- endif %} - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM disappeared_hashkeys + FROM is_active + QUALIFY + CASE + WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END - ) + ), + + {% set ns.last_cte = 'deduplicated_incoming' %} -{%- else %} +{# + This block is for single-batch processing +#} +{% else %} {# - In initial runs, every available hashkey is marked as deleted_flag = 0. + In initial loads of single-batch eff sats, every hashkey of the source is set to active. 
#} - {% for source_model in source_models %} + new_hashkeys AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey }}, + src.{{ src_ldts }}, + 1 as {{ is_active_alias }} + FROM source_data src {# - When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + For incremental runs of single-batch eff sats, only hashkeys that are not active right now are set to active. + This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - - {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} + {% if is_incremental() %} + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ is_active_alias }} = 1 + WHERE cs.{{ tracked_hashkey }} IS NULL + {% endif %} - {% if source_models | length > 1 -%} + ), - hashkeys_union AS ( + {% set ns.last_cte = 'new_hashkeys' %} - {%- for source_model in source_models -%} +{% endif %} - {%- set source_number = source_model.id | string -%} +{# + In all incremental runs, the source needs to be scanned for all currently active hashkeys. + If they are no longer present, they will be deactived. 
+#} +{%- if is_incremental() %} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkeys_{{ source_number }} + {%- if not source_is_single_batch %} + disappeared_hashkeys AS ( - {%- if not loop.last %} - UNION ALL - {% endif -%} + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM deduplicated_incoming) ldts + ON 1 = 1 + LEFT JOIN deduplicated_incoming src + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = ldts.min_ldts + WHERE + cs.{{ is_active_alias }} = 1 + AND src.{{ tracked_hashkey }} IS NULL + AND ldts.min_ldts IS NOT NULL - {%- endfor -%} + ), + {% else %} + disappeared_hashkeys AS ( + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM source_data) ldts + ON 1 = 1 + WHERE NOT EXISTS ( + SELECT + 1 + FROM source_data src + WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ) + AND cs.{{ is_active_alias }} = 1 + AND ldts.min_ldts IS NOT NULL ), + {% endif %} +{%- endif %} - hashkey_union_dedupe AS ( +records_to_insert AS ( - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkeys_union - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + {# + This first part of the UNION includes: + - for single-batch loads: Only is_active_alias = 1, deactivations are handled later + - for multi-batch loads: Ativation and deactivation inside the multiple loads + #} + SELECT + di.{{ tracked_hashkey }}, + di.{{ src_ldts }}, + di.{{ is_active_alias }} + FROM {{ ns.last_cte }} di - {%- set ns.last_cte = 'hashkey_union_dedupe' -%} - ), + {%- if is_incremental() %} - {%- endif %} + {# + For incremental 
multi-batch loads, the earliest to-be inserted status is compared to the current status. + It will only be inserted if the status changed. We use the ROW_NUMBER() + #} + {%- if not source_is_single_batch %} + WHERE NOT EXISTS ( + SELECT 1 + FROM current_status + WHERE {{ datavault4dbt.multikey(tracked_hashkey, prefix=['current_status', 'di'], condition='=') }} + AND {{ datavault4dbt.multikey(is_active_alias, prefix=['current_status', 'di'], condition='=') }} + AND di.{{ src_ldts }} = (SELECT MIN({{ src_ldts }}) FROM deduplicated_incoming) + ) + AND di.{{ src_ldts }} > (SELECT MAX({{ src_ldts }}) FROM {{ this }}) + {% endif %} - records_to_insert AS ( + {# + For all incremental loads, the disappeared hashkeys are UNIONed. + #} + UNION - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.last_cte }} + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }}, + {{ is_active_alias }} + FROM disappeared_hashkeys - ) + {%- endif %} +) -{% endif %} +SELECT * +FROM records_to_insert ri -SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} -FROM records_to_insert +{% if is_incremental() %} +WHERE NOT EXISTS ( + SELECT 1 + FROM {{ this }} t + WHERE t.{{ tracked_hashkey }} = ri.{{ tracked_hashkey }} + AND t.{{ src_ldts }} = ri.{{ src_ldts }} +) +{% endif %} -{%- endmacro -%} +{%- endmacro -%} \ No newline at end of file diff --git a/macros/tables/eff_sat_v0.sql b/macros/tables/eff_sat_v0.sql index af9f10ed..c5569bc9 100644 --- a/macros/tables/eff_sat_v0.sql +++ b/macros/tables/eff_sat_v0.sql @@ -1,14 +1,16 @@ -{%- macro eff_sat_v0(source_models, tracked_hashkey, src_ldts=none, src_rsrc=none, deleted_flag_alias=none) -%} +{%- macro eff_sat_v0(source_model, tracked_hashkey, src_ldts=none, src_rsrc=none, is_active_alias=none, source_is_single_batch=true, disable_hwm=false) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} {%- set src_rsrc = datavault4dbt.replace_standard(src_rsrc, 'datavault4dbt.rsrc_alias', 'rsrc') -%} - {%- set deleted_flag_alias = datavault4dbt.replace_standard(deleted_flag_alias, 'datavault4dbt.deleted_flag_alias', 'deleted_flag') -%} + {%- set is_active_alias = datavault4dbt.replace_standard(is_active_alias, 'datavault4dbt.is_active_alias', 'deleted_flag') -%} {{ return(adapter.dispatch('eff_sat_v0', 'datavault4dbt')(tracked_hashkey=tracked_hashkey, src_ldts=src_ldts, src_rsrc=src_rsrc, - deleted_flag_alias=deleted_flag_alias, - source_models=source_models) ) + is_active_alias=is_active_alias, + source_model=source_model, + source_is_single_batch=source_is_single_batch, + disable_hwm=disable_hwm) ) }} {%- endmacro -%} \ No newline at end of file diff --git a/macros/tables/exasol/eff_sat_v0.sql b/macros/tables/exasol/eff_sat_v0.sql index cb1915ba..7640ec81 100644 --- a/macros/tables/exasol/eff_sat_v0.sql +++ b/macros/tables/exasol/eff_sat_v0.sql @@ -1,27 +1,14 @@ -{%- macro exasol__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} +{%- macro exasol__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} {%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} {%- set timestamp_format = datavault4dbt.timestamp_format() -%} -{%- set ns = namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} +{%- set ns = namespace(last_cte= "") -%} -{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} -{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} -{%- 
if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} - {%- set source_models = {source_models: {}} -%} -{%- endif -%} +{%- set source_relation = ref(source_model) -%} -{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} -{%- set source_models = source_model_values['source_model_list'] -%} -{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} -{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} -{{ log('source_models: '~source_models, false) }} - -{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} - -{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} {%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} -{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set is_active_alias = datavault4dbt.escape_column_names(is_active_alias) -%} {%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} {%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} @@ -30,224 +17,276 @@ {{ datavault4dbt.prepend_generated_by() }} WITH + {# - For incremental runs, three different cases can occur: - hashkey disappeared -> deleted_flag = 1 - hashkey reappeared -> deleted_flag = 0 - new hashkey appeared -> deleted_flag = 0 + In all cases, the source model is selected, and optionally a HWM is applied. 
#} -{% if is_incremental() and execute %} - - {# - First, the current status for each hashkey is queried - #} - current_status AS ( - +source_data AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }} + FROM {{ source_relation }} src + WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + {%- if is_incremental() and not disable_hwm %} + AND src.{{ src_ldts }} > ( SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }} + MAX({{ src_ldts }}) FROM {{ this }} - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 - - ), - - {% for source_model in source_models -%} - - {# - For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. - #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - new_hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - LEFT JOIN current_status cs - ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} - AND cs.{{ deleted_flag_alias }} = 0 - WHERE cs.{{ tracked_hashkey }} IS NULL - - {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} - - {# - All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. - If they can't be found anywhere, they are marked as deleted_flag = 1. 
- #} - disappeared_hashkeys AS ( - - SELECT DISTINCT - cs.{{ tracked_hashkey }}, - cs.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, - 1 as {{ deleted_flag_alias }} - FROM current_status cs - WHERE - {% for source_model in source_models %} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - {{ 'AND' if not loop.first }} - NOT EXISTS ( - SELECT - 1 - FROM {{ ref(source_model.name) }} src - WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} - ) - {% endfor %} - AND cs.{{ deleted_flag_alias }} = 0 + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + ) + {%- endif %} +), - ), +{# + In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. +#} +{%- if is_incremental() %} +current_status AS ( + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias }} + FROM {{ this }} + QUALIFY + ROW_NUMBER() OVER(PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 - {%- if source_models | length > 1 -%} +), +{% endif %} - {# - If more then one source_model is defined, the new hashkeys of all source_models are unioned. - #} - new_hashkeys_union AS ( +{# + This block is for multi-batch processing. +#} +{% if not source_is_single_batch %} - {%- for source_model in source_models -%} + {# + List of all Hashkeys with their date of first appearance in the source model. + #} + hashkeys AS ( - {%- set source_number = source_model.id | string -%} + SELECT + {{ tracked_hashkey }}, + MIN({{ src_ldts }}) as first_appearance + FROM source_data + GROUP BY {{ tracked_hashkey }} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_{{ source_number }} + ), - {%- if not loop.last %} - UNION - {% endif -%} + {# + Distinct list of load dates in the multi-batch source. 
+ #} + load_dates AS ( - {%- endfor -%} + SELECT Distinct + {{ src_ldts }} + FROM source_data + + ), - ), + {# + All combinations of hashkeys and loaddates, for loaddates after the first appearance of a hashkey. + #} + history AS ( - new_hashkeys_union_dedupe AS ( + SELECT + hk.{{ tracked_hashkey }}, + ld.{{ src_ldts }} + FROM hashkeys hk + CROSS JOIN load_dates ld + WHERE ld.{{ src_ldts }} >= hk.first_appearance - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_union - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + ), - {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} + {# + All theoretical combinations are checked against the actual occurences of hashkeys in each batch / loaddate. + If a Hashkey is part of a load/batch, is_active_alias is set to 1, because the hashkey was active in that load/batch. + If a Hashkey is not part of a load/batch, is_active_alias is set to 0, because the hashkey was not active in that load/batch. + #} + is_active AS ( - ), + SELECT + h.{{ tracked_hashkey }}, + h.{{ src_ldts }}, + CASE + WHEN src.{{ tracked_hashkey }} IS NULL THEN 0 + ELSE 1 + END as {{ is_active_alias }} + FROM history h + LEFT JOIN source_data src + ON src.{{ tracked_hashkey }} = h.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = h.{{ src_ldts }} - {%- endif %} + ), {# - All hashkeys that have a status change should be inserted. - That includes new, reappeared, and disappeared hashkeys. + The rows are deduplicated on the is_active_alias, to only include status changes. + Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. 
#} - records_to_insert AS ( + deduplicated_incoming AS ( SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.new_hashkeys_cte }} + is_active.{{ tracked_hashkey }}, + is_active.{{ src_ldts }}, + is_active.{{ is_active_alias }} - UNION + {% if is_incremental() -%} + , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn + {%- endif %} - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM disappeared_hashkeys + FROM is_active + QUALIFY + CASE + WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END - ) + ), + + {% set ns.last_cte = 'deduplicated_incoming' %} -{%- else %} +{# + This block is for single-batch processing +#} +{% else %} {# - In initial runs, every available hashkey is marked as deleted_flag = 0. + In initial loads of single-batch eff sats, every hashkey of the source is set to active. #} - {% for source_model in source_models %} + new_hashkeys AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey }}, + src.{{ src_ldts }}, + 1 as {{ is_active_alias }} + FROM source_data src {# - When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + For incremental runs of single-batch eff sats, only hashkeys that are not active right now are set to active. + This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. 
#} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - - {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} + {% if is_incremental() %} + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ is_active_alias }} = 1 + WHERE cs.{{ tracked_hashkey }} IS NULL + {% endif %} - {% if source_models | length > 1 -%} + ), - hashkeys_union AS ( + {% set ns.last_cte = 'new_hashkeys' %} - {%- for source_model in source_models -%} +{% endif %} - {%- set source_number = source_model.id | string -%} +{# + In all incremental runs, the source needs to be scanned for all currently active hashkeys. + If they are no longer present, they will be deactived. 
+#} +{%- if is_incremental() %} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkeys_{{ source_number }} + {%- if not source_is_single_batch %} + disappeared_hashkeys AS ( - {%- if not loop.last %} - UNION - {% endif -%} + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM deduplicated_incoming) ldts + ON 1 = 1 + LEFT JOIN deduplicated_incoming src + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = ldts.min_ldts + WHERE + cs.{{ is_active_alias }} = 1 + AND src.{{ tracked_hashkey }} IS NULL + AND ldts.min_ldts IS NOT NULL - {%- endfor -%} + ), + {% else %} + disappeared_hashkeys AS ( + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM source_data) ldts + ON 1 = 1 + WHERE NOT EXISTS ( + SELECT + 1 + FROM source_data src + WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ) + AND cs.{{ is_active_alias }} = 1 + AND ldts.min_ldts IS NOT NULL ), + {% endif %} +{%- endif %} - hashkey_union_dedupe AS ( +records_to_insert AS ( - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkeys_union - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + {# + This first part of the UNION includes: + - for single-batch loads: Only is_active_alias = 1, deactivations are handled later + - for multi-batch loads: Ativation and deactivation inside the multiple loads + #} + SELECT + di.{{ tracked_hashkey }}, + di.{{ src_ldts }}, + di.{{ is_active_alias }} + FROM {{ ns.last_cte }} di - {%- set ns.last_cte = 'hashkey_union_dedupe' -%} - ), + {%- if is_incremental() %} - {%- endif %} + {# + For incremental 
multi-batch loads, the earliest to-be inserted status is compared to the current status. + It will only be inserted if the status changed. We use the ROW_NUMBER() + #} + {%- if not source_is_single_batch %} + WHERE NOT EXISTS ( + SELECT 1 + FROM current_status + WHERE {{ datavault4dbt.multikey(tracked_hashkey, prefix=['current_status', 'di'], condition='=') }} + AND {{ datavault4dbt.multikey(is_active_alias, prefix=['current_status', 'di'], condition='=') }} + AND di.{{ src_ldts }} = (SELECT MIN({{ src_ldts }}) FROM deduplicated_incoming) + ) + AND di.{{ src_ldts }} > (SELECT MAX({{ src_ldts }}) FROM {{ this }}) + {% endif %} - records_to_insert AS ( + {# + For all incremental loads, the disappeared hashkeys are UNIONed. + #} + UNION - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.last_cte }} + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }}, + {{ is_active_alias }} + FROM disappeared_hashkeys - ) + {%- endif %} +) -{% endif %} +SELECT * +FROM records_to_insert ri -SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} -FROM records_to_insert +{% if is_incremental() %} +WHERE NOT EXISTS ( + SELECT 1 + FROM {{ this }} t + WHERE t.{{ tracked_hashkey }} = ri.{{ tracked_hashkey }} + AND t.{{ src_ldts }} = ri.{{ src_ldts }} +) +{% endif %} -{%- endmacro -%} +{%- endmacro -%} \ No newline at end of file diff --git a/macros/tables/postgres/eff_sat_v0.sql b/macros/tables/postgres/eff_sat_v0.sql index a2f8b6ee..dcad92a7 100644 --- a/macros/tables/postgres/eff_sat_v0.sql +++ b/macros/tables/postgres/eff_sat_v0.sql @@ -1,27 +1,14 @@ -{%- macro postgres__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} +{%- macro postgres__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} {%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} {%- set timestamp_format = datavault4dbt.timestamp_format() -%} -{%- set ns = 
namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} +{%- set ns = namespace(last_cte= "") -%} -{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} -{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} -{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} - {%- set source_models = {source_models: {}} -%} -{%- endif -%} +{%- set source_relation = ref(source_model) -%} -{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} -{%- set source_models = source_model_values['source_model_list'] -%} -{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} -{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} -{{ log('source_models: '~source_models, false) }} - -{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} - -{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} {%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} -{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set is_active_alias = datavault4dbt.escape_column_names(is_active_alias) -%} {%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} {%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} @@ -30,260 +17,286 @@ {{ datavault4dbt.prepend_generated_by() }} WITH + {# - For incremental runs, three different cases can occur: - hashkey disappeared -> deleted_flag = 1 - hashkey reappeared -> deleted_flag = 0 - new 
hashkey appeared -> deleted_flag = 0 + In all cases, the source model is selected, and optionally a HWM is applied. #} -{% if is_incremental() and execute %} - - {# - First, the current status for each hashkey is queried - #} - - current_status_prep AS ( - +source_data AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }} + FROM {{ source_relation }} src + WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + {%- if is_incremental() and not disable_hwm %} + AND src.{{ src_ldts }} > ( SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + MAX({{ src_ldts }}) FROM {{ this }} + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + ) + {%- endif %} +), - ), - - current_status AS ( - - SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }} - FROM current_status_prep - WHERE rn = 1 - - ), - - {% for source_model in source_models -%} - - {# - For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. 
- #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - new_hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - LEFT JOIN current_status cs - ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} - AND cs.{{ deleted_flag_alias }} = 0 - WHERE cs.{{ tracked_hashkey }} IS NULL - - {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} - - {# - All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. - If they can't be found anywhere, they are marked as deleted_flag = 1. - #} - disappeared_hashkeys AS ( - - SELECT DISTINCT - cs.{{ tracked_hashkey }}, - cs.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, - 1 as {{ deleted_flag_alias }} - FROM current_status cs - WHERE - {% for source_model in source_models %} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - {{ 'AND' if not loop.first }} - NOT EXISTS ( - SELECT - 1 - FROM {{ ref(source_model.name) }} src - WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} - ) - {% endfor %} - AND cs.{{ deleted_flag_alias }} = 0 +{# + In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. +#} +{%- if is_incremental() %} +current_status_prep AS ( - ), + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias}}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) as rn + FROM {{ this }} +), - {%- if source_models | length > 1 -%} +current_status AS ( - {# - If more then one source_model is defined, the new hashkeys of all source_models are unioned. 
- #} - new_hashkeys_union AS ( + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias }} + FROM current_status_prep + WHERE rn = 1 - {%- for source_model in source_models -%} +), +{% endif %} - {%- set source_number = source_model.id | string -%} +{# + This block is for multi-batch processing. +#} +{% if not source_is_single_batch %} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_{{ source_number }} + {# + List of all Hashkeys with their date of first appearance in the source model. + #} + hashkeys AS ( - {%- if not loop.last %} - UNION - {% endif -%} + SELECT + {{ tracked_hashkey }}, + MIN({{ src_ldts }}) as first_appearance + FROM source_data + GROUP BY {{ tracked_hashkey }} - {%- endfor -%} + ), - ), + {# + Distinct list of load dates in the multi-batch source. + #} + load_dates AS ( - new_hashkeys_union_dedupe_prep AS ( + SELECT Distinct + {{ src_ldts }} + FROM source_data + + ), - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn - FROM new_hashkeys_union - - ), + {# + All combinations of hashkeys and loaddates, for loaddates after the first appearance of a hashkey. + #} + history AS ( - new_hashkeys_union_dedupe AS ( + SELECT + hk.{{ tracked_hashkey }}, + ld.{{ src_ldts }} + FROM hashkeys hk + CROSS JOIN load_dates ld + WHERE ld.{{ src_ldts }} >= hk.first_appearance - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_union_dedupe_prep - WHERE rn = 1 + ), - {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} + {# + All theoretical combinations are checked against the actual occurences of hashkeys in each batch / loaddate. + If a Hashkey is part of a load/batch, is_active_alias is set to 1, because the hashkey was active in that load/batch. 
+ If a Hashkey is not part of a load/batch, is_active_alias is set to 0, because the hashkey was not active in that load/batch. + #} + is_active AS ( - ), + SELECT + h.{{ tracked_hashkey }}, + h.{{ src_ldts }}, + CASE + WHEN src.{{ tracked_hashkey }} IS NULL THEN 0 + ELSE 1 + END as {{ is_active_alias }} + FROM history h + LEFT JOIN source_data src + ON src.{{ tracked_hashkey }} = h.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = h.{{ src_ldts }} - {%- endif %} + ), {# - All hashkeys that have a status change should be inserted. - That includes new, reappeared, and disappeared hashkeys. + The rows are deduplicated on the is_active_alias, to only include status changes. + Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. #} - records_to_insert AS ( + deduplicated_incoming AS ( SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.new_hashkeys_cte }} + is_active.{{ tracked_hashkey }}, + is_active.{{ src_ldts }}, + is_active.{{ is_active_alias }}, + LAG(is_active.{{ is_active_alias }} OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }})) as lag_is_active + + {% if is_incremental() -%} + , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn + {%- endif %} + + FROM is_active + QUALIFY + CASE + WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END - UNION - - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM disappeared_hashkeys + ), - ) + {% set ns.last_cte = 'deduplicated_incoming' %} -{%- else %} +{# + This block is for single-batch processing +#} +{% else %} {# - In initial runs, every available hashkey is marked as deleted_flag = 0. + In initial loads of single-batch eff sats, every hashkey of the source is set to active. 
#} - {% for source_model in source_models %} + new_hashkeys AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey }}, + src.{{ src_ldts }}, + 1 as {{ is_active_alias }} + FROM source_data src {# - When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + For incremental runs of single-batch eff sats, only hashkeys that are not active right now are set to active. + This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - - {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} - - {% if source_models | length > 1 -%} - - hashkeys_union AS ( + {% if is_incremental() %} + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ is_active_alias }} = 1 + WHERE cs.{{ tracked_hashkey }} IS NULL + {% endif %} - {%- for source_model in source_models -%} + ), - {%- set source_number = source_model.id | string -%} + {% set ns.last_cte = 'new_hashkeys' %} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkeys_{{ source_number }} +{% endif %} - {%- if not loop.last %} - UNION - {% endif -%} +{# + In all incremental runs, the source needs to be scanned for all currently active hashkeys. + If they are no longer present, they will be deactived. 
+#} +{%- if is_incremental() %} - {%- endfor -%} + {%- if not source_is_single_batch %} + disappeared_hashkeys AS ( + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM deduplicated_incoming) ldts + ON 1 = 1 + LEFT JOIN deduplicated_incoming src + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = ldts.min_ldts + WHERE + cs.{{ is_active_alias }} = 1 + AND src.{{ tracked_hashkey }} IS NULL + AND ldts.min_ldts IS NOT NULL ), + {% else %} + disappeared_hashkeys AS ( - hashkey_union_dedupe_prep AS ( - - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn - FROM hashkeys_union + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM source_data) ldts + ON 1 = 1 + WHERE NOT EXISTS ( + SELECT + 1 + FROM source_data src + WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ) + AND cs.{{ is_active_alias }} = 1 + AND ldts.min_ldts IS NOT NULL ), + {% endif %} +{%- endif %} - hashkey_union_dedupe AS ( +records_to_insert AS ( - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkey_union_dedupe_prep - WHERE rn = 1 + {# + This first part of the UNION includes: + - for single-batch loads: Only is_active_alias = 1, deactivations are handled later + - for multi-batch loads: Ativation and deactivation inside the multiple loads + #} + SELECT + di.{{ tracked_hashkey }}, + di.{{ src_ldts }}, + di.{{ is_active_alias }} + FROM {{ ns.last_cte }} di - {%- set ns.last_cte = 'hashkey_union_dedupe' -%} - ), + {%- if is_incremental() %} - {%- endif %} + {# + For incremental 
multi-batch loads, the earliest to-be inserted status is compared to the current status. + It will only be inserted if the status changed. We use the ROW_NUMBER() + #} + {%- if not source_is_single_batch %} + WHERE NOT EXISTS ( + SELECT 1 + FROM current_status + WHERE {{ datavault4dbt.multikey(tracked_hashkey, prefix=['current_status', 'di'], condition='=') }} + AND {{ datavault4dbt.multikey(is_active_alias, prefix=['current_status', 'di'], condition='=') }} + AND di.{{ src_ldts }} = (SELECT MIN({{ src_ldts }}) FROM deduplicated_incoming) + ) + AND di.{{ src_ldts }} > (SELECT MAX({{ src_ldts }}) FROM {{ this }}) + {% endif %} - records_to_insert AS ( + {# + For all incremental loads, the disappeared hashkeys are UNIONed. + #} + UNION - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.last_cte }} + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }}, + {{ is_active_alias }} + FROM disappeared_hashkeys - ) + {%- endif %} +) -{% endif %} +SELECT * +FROM records_to_insert ri -SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} -FROM records_to_insert +{% if is_incremental() %} +WHERE NOT EXISTS ( + SELECT 1 + FROM {{ this }} t + WHERE t.{{ tracked_hashkey }} = ri.{{ tracked_hashkey }} + AND t.{{ src_ldts }} = ri.{{ src_ldts }} +) +{% endif %} -{%- endmacro -%} +{%- endmacro -%} \ No newline at end of file diff --git a/macros/tables/redshift/eff_sat_v0.sql b/macros/tables/redshift/eff_sat_v0.sql index bd8d122f..7133c700 100644 --- a/macros/tables/redshift/eff_sat_v0.sql +++ b/macros/tables/redshift/eff_sat_v0.sql @@ -1,27 +1,14 @@ -{%- macro redshift__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} +{%- macro redshift__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} {%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} {%- set timestamp_format = datavault4dbt.timestamp_format() -%} -{%- set ns = 
namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} +{%- set ns = namespace(last_cte= "") -%} -{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} -{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} -{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} - {%- set source_models = {source_models: {}} -%} -{%- endif -%} +{%- set source_relation = ref(source_model) -%} -{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} -{%- set source_models = source_model_values['source_model_list'] -%} -{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} -{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} -{{ log('source_models: '~source_models, false) }} - -{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} - -{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} {%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} -{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set is_active_alias = datavault4dbt.escape_column_names(is_active_alias) -%} {%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} {%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} @@ -30,260 +17,285 @@ {{ datavault4dbt.prepend_generated_by() }} WITH + {# - For incremental runs, three different cases can occur: - hashkey disappeared -> deleted_flag = 1 - hashkey reappeared -> deleted_flag = 0 - new 
hashkey appeared -> deleted_flag = 0 + In all cases, the source model is selected, and optionally a HWM is applied. #} -{% if is_incremental() and execute %} - - {# - First, the current status for each hashkey is queried - #} - - current_status_prep AS ( - +source_data AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }} + FROM {{ source_relation }} src + WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + {%- if is_incremental() and not disable_hwm %} + AND src.{{ src_ldts }} > ( SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + MAX({{ src_ldts }}) FROM {{ this }} + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + ) + {%- endif %} +), - ), - - current_status AS ( - - SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }} - FROM current_status_prep - WHERE rn = 1 - - ), - - {% for source_model in source_models -%} - - {# - For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. 
- #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - new_hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - LEFT JOIN current_status cs - ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} - AND cs.{{ deleted_flag_alias }} = 0 - WHERE cs.{{ tracked_hashkey }} IS NULL - - {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} - - {# - All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. - If they can't be found anywhere, they are marked as deleted_flag = 1. - #} - disappeared_hashkeys AS ( - - SELECT DISTINCT - cs.{{ tracked_hashkey }}, - cs.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, - 1 as {{ deleted_flag_alias }} - FROM current_status cs - WHERE - {% for source_model in source_models %} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - {{ 'AND' if not loop.first }} - NOT EXISTS ( - SELECT - 1 - FROM {{ ref(source_model.name) }} src - WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} - ) - {% endfor %} - AND cs.{{ deleted_flag_alias }} = 0 +{# + In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. +#} +{%- if is_incremental() %} +current_status_prep AS ( - ), + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias}}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) as rn + FROM {{ this }} +), - {%- if source_models | length > 1 -%} +current_status AS ( - {# - If more then one source_model is defined, the new hashkeys of all source_models are unioned. 
- #} - new_hashkeys_union AS ( + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias }} + FROM current_status_prep + WHERE rn = 1 - {%- for source_model in source_models -%} +), +{% endif %} - {%- set source_number = source_model.id | string -%} +{# + This block is for multi-batch processing. +#} +{% if not source_is_single_batch %} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_{{ source_number }} + {# + List of all Hashkeys with their date of first appearance in the source model. + #} + hashkeys AS ( - {%- if not loop.last %} - UNION - {% endif -%} + SELECT + {{ tracked_hashkey }}, + MIN({{ src_ldts }}) as first_appearance + FROM source_data + GROUP BY {{ tracked_hashkey }} - {%- endfor -%} + ), - ), + {# + Distinct list of load dates in the multi-batch source. + #} + load_dates AS ( - new_hashkeys_union_dedupe_prep AS ( + SELECT Distinct + {{ src_ldts }} + FROM source_data + + ), - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn - FROM new_hashkeys_union - - ), + {# + All combinations of hashkeys and loaddates, for loaddates after the first appearance of a hashkey. + #} + history AS ( - new_hashkeys_union_dedupe AS ( + SELECT + hk.{{ tracked_hashkey }}, + ld.{{ src_ldts }} + FROM hashkeys hk + CROSS JOIN load_dates ld + WHERE ld.{{ src_ldts }} >= hk.first_appearance - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_union_dedupe_prep - WHERE rn = 1 + ), - {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} + {# + All theoretical combinations are checked against the actual occurences of hashkeys in each batch / loaddate. + If a Hashkey is part of a load/batch, is_active_alias is set to 1, because the hashkey was active in that load/batch. 
+ If a Hashkey is not part of a load/batch, is_active_alias is set to 0, because the hashkey was not active in that load/batch. + #} + is_active AS ( - ), + SELECT + h.{{ tracked_hashkey }}, + h.{{ src_ldts }}, + CASE + WHEN src.{{ tracked_hashkey }} IS NULL THEN 0 + ELSE 1 + END as {{ is_active_alias }} + FROM history h + LEFT JOIN source_data src + ON src.{{ tracked_hashkey }} = h.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = h.{{ src_ldts }} - {%- endif %} + ), {# - All hashkeys that have a status change should be inserted. - That includes new, reappeared, and disappeared hashkeys. + The rows are deduplicated on the is_active_alias, to only include status changes. + Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. #} - records_to_insert AS ( + deduplicated_incoming AS ( SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.new_hashkeys_cte }} + is_active.{{ tracked_hashkey }}, + is_active.{{ src_ldts }}, + is_active.{{ is_active_alias }} - UNION + {% if is_incremental() -%} + , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn + {%- endif %} - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM disappeared_hashkeys + FROM is_active + QUALIFY + CASE + WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END - ) + ), + + {% set ns.last_cte = 'deduplicated_incoming' %} -{%- else %} +{# + This block is for single-batch processing +#} +{% else %} {# - In initial runs, every available hashkey is marked as deleted_flag = 0. + In initial loads of single-batch eff sats, every hashkey of the source is set to active. 
#} - {% for source_model in source_models %} + new_hashkeys AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey }}, + src.{{ src_ldts }}, + 1 as {{ is_active_alias }} + FROM source_data src {# - When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + For incremental runs of single-batch eff sats, only hashkeys that are not active right now are set to active. + This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - - {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} - - {% if source_models | length > 1 -%} - - hashkeys_union AS ( + {% if is_incremental() %} + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ is_active_alias }} = 1 + WHERE cs.{{ tracked_hashkey }} IS NULL + {% endif %} - {%- for source_model in source_models -%} + ), - {%- set source_number = source_model.id | string -%} + {% set ns.last_cte = 'new_hashkeys' %} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkeys_{{ source_number }} +{% endif %} - {%- if not loop.last %} - UNION - {% endif -%} +{# + In all incremental runs, the source needs to be scanned for all currently active hashkeys. + If they are no longer present, they will be deactived. 
+#} +{%- if is_incremental() %} - {%- endfor -%} + {%- if not source_is_single_batch %} + disappeared_hashkeys AS ( + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM deduplicated_incoming) ldts + ON 1 = 1 + LEFT JOIN deduplicated_incoming src + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = ldts.min_ldts + WHERE + cs.{{ is_active_alias }} = 1 + AND src.{{ tracked_hashkey }} IS NULL + AND ldts.min_ldts IS NOT NULL ), + {% else %} + disappeared_hashkeys AS ( - hashkey_union_dedupe_prep AS ( - - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn - FROM hashkeys_union + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM source_data) ldts + ON 1 = 1 + WHERE NOT EXISTS ( + SELECT + 1 + FROM source_data src + WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ) + AND cs.{{ is_active_alias }} = 1 + AND ldts.min_ldts IS NOT NULL ), + {% endif %} +{%- endif %} - hashkey_union_dedupe AS ( +records_to_insert AS ( - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkey_union_dedupe_prep - WHERE rn = 1 + {# + This first part of the UNION includes: + - for single-batch loads: Only is_active_alias = 1, deactivations are handled later + - for multi-batch loads: Ativation and deactivation inside the multiple loads + #} + SELECT + di.{{ tracked_hashkey }}, + di.{{ src_ldts }}, + di.{{ is_active_alias }} + FROM {{ ns.last_cte }} di - {%- set ns.last_cte = 'hashkey_union_dedupe' -%} - ), + {%- if is_incremental() %} - {%- endif %} + {# + For incremental 
multi-batch loads, the earliest to-be inserted status is compared to the current status. + It will only be inserted if the status changed. We use the ROW_NUMBER() + #} + {%- if not source_is_single_batch %} + WHERE NOT EXISTS ( + SELECT 1 + FROM current_status + WHERE {{ datavault4dbt.multikey(tracked_hashkey, prefix=['current_status', 'di'], condition='=') }} + AND {{ datavault4dbt.multikey(is_active_alias, prefix=['current_status', 'di'], condition='=') }} + AND di.{{ src_ldts }} = (SELECT MIN({{ src_ldts }}) FROM deduplicated_incoming) + ) + AND di.{{ src_ldts }} > (SELECT MAX({{ src_ldts }}) FROM {{ this }}) + {% endif %} - records_to_insert AS ( + {# + For all incremental loads, the disappeared hashkeys are UNIONed. + #} + UNION - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.last_cte }} + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }}, + {{ is_active_alias }} + FROM disappeared_hashkeys - ) + {%- endif %} +) -{% endif %} +SELECT * +FROM records_to_insert ri -SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} -FROM records_to_insert +{% if is_incremental() %} +WHERE NOT EXISTS ( + SELECT 1 + FROM {{ this }} t + WHERE t.{{ tracked_hashkey }} = ri.{{ tracked_hashkey }} + AND t.{{ src_ldts }} = ri.{{ src_ldts }} +) +{% endif %} -{%- endmacro -%} +{%- endmacro -%} \ No newline at end of file diff --git a/macros/tables/snowflake/eff_sat_v0.sql b/macros/tables/snowflake/eff_sat_v0.sql index 4965ed0d..c2b7a75e 100644 --- a/macros/tables/snowflake/eff_sat_v0.sql +++ b/macros/tables/snowflake/eff_sat_v0.sql @@ -1,27 +1,14 @@ -{%- macro snowflake__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} +{%- macro snowflake__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} {%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} {%- set timestamp_format = datavault4dbt.timestamp_format() -%} -{%- set ns = 
namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} +{%- set ns = namespace(last_cte= "") -%} -{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} -{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} -{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} - {%- set source_models = {source_models: {}} -%} -{%- endif -%} +{%- set source_relation = ref(source_model) -%} -{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} -{%- set source_models = source_model_values['source_model_list'] -%} -{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} -{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} -{{ log('source_models: '~source_models, false) }} - -{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} - -{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} {%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} -{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set is_active_alias = datavault4dbt.escape_column_names(is_active_alias) -%} {%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} {%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} @@ -30,224 +17,276 @@ {{ datavault4dbt.prepend_generated_by() }} WITH + {# - For incremental runs, three different cases can occur: - hashkey disappeared -> deleted_flag = 1 - hashkey reappeared -> deleted_flag = 0 - new 
hashkey appeared -> deleted_flag = 0 + In all cases, the source model is selected, and optionally a HWM is applied. #} -{% if is_incremental() and execute %} - - {# - First, the current status for each hashkey is queried - #} - current_status AS ( - +source_data AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }} + FROM {{ source_relation }} src + WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + {%- if is_incremental() and not disable_hwm %} + AND src.{{ src_ldts }} > ( SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }} + MAX({{ src_ldts }}) FROM {{ this }} - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) = 1 - - ), - - {% for source_model in source_models -%} - - {# - For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. - #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - new_hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - LEFT JOIN current_status cs - ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} - AND cs.{{ deleted_flag_alias }} = 0 - WHERE cs.{{ tracked_hashkey }} IS NULL - - {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} - - {# - All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. - If they can't be found anywhere, they are marked as deleted_flag = 1. 
- #} - disappeared_hashkeys AS ( - - SELECT DISTINCT - cs.{{ tracked_hashkey }}, - cs.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, - 1 as {{ deleted_flag_alias }} - FROM current_status cs - WHERE - {% for source_model in source_models %} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - {{ 'AND' if not loop.first }} - NOT EXISTS ( - SELECT - 1 - FROM {{ ref(source_model.name) }} src - WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} - ) - {% endfor %} - AND cs.{{ deleted_flag_alias }} = 0 + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + ) + {%- endif %} +), - ), +{# + In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. +#} +{%- if is_incremental() %} +current_status AS ( + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias }} + FROM {{ this }} + QUALIFY + ROW_NUMBER() OVER(PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 - {%- if source_models | length > 1 -%} +), +{% endif %} - {# - If more then one source_model is defined, the new hashkeys of all source_models are unioned. - #} - new_hashkeys_union AS ( +{# + This block is for multi-batch processing. +#} +{% if not source_is_single_batch %} - {%- for source_model in source_models -%} + {# + List of all Hashkeys with their date of first appearance in the source model. + #} + hashkeys AS ( - {%- set source_number = source_model.id | string -%} + SELECT + {{ tracked_hashkey }}, + MIN({{ src_ldts }}) as first_appearance + FROM source_data + GROUP BY {{ tracked_hashkey }} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_{{ source_number }} + ), - {%- if not loop.last %} - UNION - {% endif -%} + {# + Distinct list of load dates in the multi-batch source. 
+ #} + load_dates AS ( - {%- endfor -%} + SELECT Distinct + {{ src_ldts }} + FROM source_data + + ), - ), + {# + All combinations of hashkeys and loaddates, for loaddates after the first appearance of a hashkey. + #} + history AS ( - new_hashkeys_union_dedupe AS ( + SELECT + hk.{{ tracked_hashkey }}, + ld.{{ src_ldts }} + FROM hashkeys hk + CROSS JOIN load_dates ld + WHERE ld.{{ src_ldts }} >= hk.first_appearance - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_union - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + ), - {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} + {# + All theoretical combinations are checked against the actual occurences of hashkeys in each batch / loaddate. + If a Hashkey is part of a load/batch, is_active_alias is set to 1, because the hashkey was active in that load/batch. + If a Hashkey is not part of a load/batch, is_active_alias is set to 0, because the hashkey was not active in that load/batch. + #} + is_active AS ( - ), + SELECT + h.{{ tracked_hashkey }}, + h.{{ src_ldts }}, + CASE + WHEN src.{{ tracked_hashkey }} IS NULL THEN 0 + ELSE 1 + END as {{ is_active_alias }} + FROM history h + LEFT JOIN source_data src + ON src.{{ tracked_hashkey }} = h.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = h.{{ src_ldts }} - {%- endif %} + ), {# - All hashkeys that have a status change should be inserted. - That includes new, reappeared, and disappeared hashkeys. + The rows are deduplicated on the is_active_alias, to only include status changes. + Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. 
#} - records_to_insert AS ( + deduplicated_incoming AS ( SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.new_hashkeys_cte }} + is_active.{{ tracked_hashkey }}, + is_active.{{ src_ldts }}, + is_active.{{ is_active_alias }} - UNION + {% if is_incremental() -%} + , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn + {%- endif %} - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM disappeared_hashkeys + FROM is_active + QUALIFY + CASE + WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END - ) + ), + + {% set ns.last_cte = 'deduplicated_incoming' %} -{%- else %} +{# + This block is for single-batch processing +#} +{% else %} {# - In initial runs, every available hashkey is marked as deleted_flag = 0. + In initial loads of single-batch eff sats, every hashkey of the source is set to active. #} - {% for source_model in source_models %} + new_hashkeys AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey }}, + src.{{ src_ldts }}, + 1 as {{ is_active_alias }} + FROM source_data src {# - When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + For incremental runs of single-batch eff sats, only hashkeys that are not active right now are set to active. + This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. 
#} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - - {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} + {% if is_incremental() %} + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ is_active_alias }} = 1 + WHERE cs.{{ tracked_hashkey }} IS NULL + {% endif %} - {% if source_models | length > 1 -%} + ), - hashkeys_union AS ( + {% set ns.last_cte = 'new_hashkeys' %} - {%- for source_model in source_models -%} +{% endif %} - {%- set source_number = source_model.id | string -%} +{# + In all incremental runs, the source needs to be scanned for all currently active hashkeys. + If they are no longer present, they will be deactived. 
+#} +{%- if is_incremental() %} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkeys_{{ source_number }} + {%- if not source_is_single_batch %} + disappeared_hashkeys AS ( - {%- if not loop.last %} - UNION - {% endif -%} + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM deduplicated_incoming) ldts + ON 1 = 1 + LEFT JOIN deduplicated_incoming src + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = ldts.min_ldts + WHERE + cs.{{ is_active_alias }} = 1 + AND src.{{ tracked_hashkey }} IS NULL + AND ldts.min_ldts IS NOT NULL - {%- endfor -%} + ), + {% else %} + disappeared_hashkeys AS ( + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM source_data) ldts + ON 1 = 1 + WHERE NOT EXISTS ( + SELECT + 1 + FROM source_data src + WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ) + AND cs.{{ is_active_alias }} = 1 + AND ldts.min_ldts IS NOT NULL ), + {% endif %} +{%- endif %} - hashkey_union_dedupe AS ( +records_to_insert AS ( - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkeys_union - QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) = 1 + {# + This first part of the UNION includes: + - for single-batch loads: Only is_active_alias = 1, deactivations are handled later + - for multi-batch loads: Ativation and deactivation inside the multiple loads + #} + SELECT + di.{{ tracked_hashkey }}, + di.{{ src_ldts }}, + di.{{ is_active_alias }} + FROM {{ ns.last_cte }} di - {%- set ns.last_cte = 'hashkey_union_dedupe' -%} - ), + {%- if is_incremental() %} - {%- endif %} + {# + For incremental 
multi-batch loads, the earliest to-be inserted status is compared to the current status. + It will only be inserted if the status changed. We use the ROW_NUMBER() + #} + {%- if not source_is_single_batch %} + WHERE NOT EXISTS ( + SELECT 1 + FROM current_status + WHERE {{ datavault4dbt.multikey(tracked_hashkey, prefix=['current_status', 'di'], condition='=') }} + AND {{ datavault4dbt.multikey(is_active_alias, prefix=['current_status', 'di'], condition='=') }} + AND di.{{ src_ldts }} = (SELECT MIN({{ src_ldts }}) FROM deduplicated_incoming) + ) + AND di.{{ src_ldts }} > (SELECT MAX({{ src_ldts }}) FROM {{ this }}) + {% endif %} - records_to_insert AS ( + {# + For all incremental loads, the disappeared hashkeys are UNIONed. + #} + UNION - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.last_cte }} + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }}, + {{ is_active_alias }} + FROM disappeared_hashkeys - ) + {%- endif %} +) -{% endif %} +SELECT * +FROM records_to_insert ri -SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} -FROM records_to_insert +{% if is_incremental() %} +WHERE NOT EXISTS ( + SELECT 1 + FROM {{ this }} t + WHERE t.{{ tracked_hashkey }} = ri.{{ tracked_hashkey }} + AND t.{{ src_ldts }} = ri.{{ src_ldts }} +) +{% endif %} -{%- endmacro -%} +{%- endmacro -%} \ No newline at end of file diff --git a/macros/tables/synapse/eff_sat_v0.sql b/macros/tables/synapse/eff_sat_v0.sql index 4fd54205..f6257393 100644 --- a/macros/tables/synapse/eff_sat_v0.sql +++ b/macros/tables/synapse/eff_sat_v0.sql @@ -1,27 +1,14 @@ -{%- macro synapse__eff_sat_v0(source_models, tracked_hashkey, src_ldts, src_rsrc, deleted_flag_alias) -%} +{%- macro synapse__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} {%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} {%- set timestamp_format = datavault4dbt.timestamp_format() -%} -{%- set ns = 
namespace(new_hashkeys_cte="", disappeared_hashkeys_cte="", last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} +{%- set ns = namespace(last_cte= "") -%} -{# If no rsrc_static parameter is defined in ANY of the source models then the whole code block of record_source performance lookup is not executed #} -{# For the use of record_source performance lookup it is required that every source model has the parameter rsrc_static defined and it cannot be an empty string #} -{%- if source_models is not mapping and not datavault4dbt.is_list(source_models) -%} - {%- set source_models = {source_models: {}} -%} -{%- endif -%} +{%- set source_relation = ref(source_model) -%} -{%- set source_model_values = fromjson(datavault4dbt.source_model_processing(source_models=source_models, parameters={'tracked_hashkey':tracked_hashkey})) -%} -{%- set source_models = source_model_values['source_model_list'] -%} -{%- set ns.has_rsrc_static_defined = source_model_values['has_rsrc_static_defined'] -%} -{%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} -{{ log('source_models: '~source_models, false) }} - -{%- set final_columns_to_select = [tracked_hashkey] + [src_rsrc] + [src_ldts] + [deleted_flag_alias] -%} - -{%- set final_columns_to_select = datavault4dbt.escape_column_names(final_columns_to_select) -%} {%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} -{%- set deleted_flag_alias = datavault4dbt.escape_column_names(deleted_flag_alias) -%} +{%- set is_active_alias = datavault4dbt.escape_column_names(is_active_alias) -%} {%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} {%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} @@ -30,259 +17,285 @@ {{ datavault4dbt.prepend_generated_by() }} WITH + {# - For incremental runs, three different cases can occur: - hashkey disappeared -> deleted_flag = 1 - hashkey reappeared -> deleted_flag = 0 - new 
hashkey appeared -> deleted_flag = 0 + In all cases, the source model is selected, and optionally a HWM is applied. #} -{% if is_incremental() and execute %} - - {# - First, the current status for each hashkey is queried - #} - current_status_prep AS ( - +source_data AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }} + FROM {{ source_relation }} src + WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + {%- if is_incremental() and not disable_hwm %} + AND src.{{ src_ldts }} > ( SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn + MAX({{ src_ldts }}) FROM {{ this }} + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + ) + {%- endif %} +), - ), - - current_status AS ( - - SELECT - {{ tracked_hashkey }}, - {{ deleted_flag_alias }}, - {{ src_rsrc }} - FROM current_status_prep - WHERE rn = 1 - - ), - - {% for source_model in source_models -%} - - {# - For each source_model, all hashkeys that are not yet in the effectivity satellite, or are currently marked as deleted, get 0 as deleted_flag. 
- #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - new_hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - LEFT JOIN current_status cs - ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} - AND cs.{{ deleted_flag_alias }} = 0 - WHERE cs.{{ tracked_hashkey }} IS NULL - - {%- set ns.new_hashkeys_cte = 'new_hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} - - {# - All hashkeys, that currently have deleted_flag = 0, are searched for in all source models. - If they can't be found anywhere, they are marked as deleted_flag = 1. - #} - disappeared_hashkeys AS ( - - SELECT DISTINCT - cs.{{ tracked_hashkey }}, - cs.{{ src_rsrc }}, - {{ datavault4dbt.current_timestamp() }} as {{ src_ldts }}, - 1 as {{ deleted_flag_alias }} - FROM current_status cs - WHERE - {% for source_model in source_models %} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - {{ 'AND' if not loop.first }} - NOT EXISTS ( - SELECT - 1 - FROM {{ ref(source_model.name) }} src - WHERE src.{{ tracked_hashkey_src }} = cs.{{ tracked_hashkey }} - ) - {% endfor %} - AND cs.{{ deleted_flag_alias }} = 0 +{# + In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. +#} +{%- if is_incremental() %} +current_status_prep AS ( - ), + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias}}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) as rn + FROM {{ this }} +), - {%- if source_models | length > 1 -%} +current_status AS ( - {# - If more then one source_model is defined, the new hashkeys of all source_models are unioned. 
- #} - new_hashkeys_union AS ( + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias }} + FROM current_status_prep + WHERE rn = 1 - {%- for source_model in source_models -%} +), +{% endif %} - {%- set source_number = source_model.id | string -%} +{# + This block is for multi-batch processing. +#} +{% if not source_is_single_batch %} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_{{ source_number }} + {# + List of all Hashkeys with their date of first appearance in the source model. + #} + hashkeys AS ( - {%- if not loop.last %} - UNION - {% endif -%} + SELECT + {{ tracked_hashkey }}, + MIN({{ src_ldts }}) as first_appearance + FROM source_data + GROUP BY {{ tracked_hashkey }} - {%- endfor -%} + ), - ), + {# + Distinct list of load dates in the multi-batch source. + #} + load_dates AS ( - new_hashkeys_union_dedupe_prep AS ( + SELECT Distinct + {{ src_ldts }} + FROM source_data + + ), - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn - FROM new_hashkeys_union - - ), + {# + All combinations of hashkeys and loaddates, for loaddates after the first appearance of a hashkey. + #} + history AS ( - new_hashkeys_union_dedupe AS ( + SELECT + hk.{{ tracked_hashkey }}, + ld.{{ src_ldts }} + FROM hashkeys hk + CROSS JOIN load_dates ld + WHERE ld.{{ src_ldts }} >= hk.first_appearance - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM new_hashkeys_union_dedupe_prep - WHERE rn = 1 + ), - {%- set ns.new_hashkeys_cte = 'new_hashkeys_union_dedupe' -%} + {# + All theoretical combinations are checked against the actual occurences of hashkeys in each batch / loaddate. + If a Hashkey is part of a load/batch, is_active_alias is set to 1, because the hashkey was active in that load/batch. 
+ If a Hashkey is not part of a load/batch, is_active_alias is set to 0, because the hashkey was not active in that load/batch. + #} + is_active AS ( - ), + SELECT + h.{{ tracked_hashkey }}, + h.{{ src_ldts }}, + CASE + WHEN src.{{ tracked_hashkey }} IS NULL THEN 0 + ELSE 1 + END as {{ is_active_alias }} + FROM history h + LEFT JOIN source_data src + ON src.{{ tracked_hashkey }} = h.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = h.{{ src_ldts }} - {%- endif %} + ), {# - All hashkeys that have a status change should be inserted. - That includes new, reappeared, and disappeared hashkeys. + The rows are deduplicated on the is_active_alias, to only include status changes. + Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. T-SQL has no QUALIFY clause and no TRUE/FALSE literals, so LAG() is computed in a derived table and filtered with a plain WHERE. #} - records_to_insert AS ( + deduplicated_incoming AS ( SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.new_hashkeys_cte }} + lagged.{{ tracked_hashkey }}, + lagged.{{ src_ldts }}, + lagged.{{ is_active_alias }} - UNION + {% if is_incremental() -%} + , lagged.rn + {%- endif %} - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM disappeared_hashkeys + FROM ( + SELECT is_active.{{ tracked_hashkey }}, is_active.{{ src_ldts }}, is_active.{{ is_active_alias }}, LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as lag_is_active {% if is_incremental() -%} , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn {%- endif %} + FROM is_active + ) lagged + WHERE lagged.{{ is_active_alias }} <> lagged.lag_is_active + OR lagged.lag_is_active IS NULL {# the first row per hashkey has no predecessor (NULL lag) and always counts as a status change #} - ) + ), + + {% set ns.last_cte = 'deduplicated_incoming' %} -{%- else %} +{# + This block is for single-batch processing +#} +{% else %} {# - In initial runs, every available hashkey is marked as deleted_flag = 0. + In initial loads of single-batch eff sats, every hashkey of the source is set to active.
#} - {% for source_model in source_models %} + new_hashkeys AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey }}, + src.{{ src_ldts }}, + 1 as {{ is_active_alias }} + FROM source_data src {# - When multiple source_models are defined, the hashkeys of all source_models are determined and unioned. + For incremental runs of single-batch eff sats, only hashkeys that are not active right now are set to active. + This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. #} - {%- set source_number = source_model.id | string -%} - {%- set tracked_hashkey_src = datavault4dbt.escape_column_names(source_model['tracked_hashkey']) -%} - - hashkeys_{{ source_number }} AS ( - - SELECT DISTINCT - src.{{ tracked_hashkey_src }} AS {{ tracked_hashkey }}, - src.{{ src_rsrc }}, - src.{{ src_ldts }}, - 0 as {{ deleted_flag_alias }} - FROM {{ ref(source_model.name) }} src - - {%- set ns.last_cte = 'hashkeys_' ~ source_number -%} - - ), - - {%- endfor %} - - {% if source_models | length > 1 -%} - - hashkeys_union AS ( + {% if is_incremental() %} + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ is_active_alias }} = 1 + WHERE cs.{{ tracked_hashkey }} IS NULL + {% endif %} - {%- for source_model in source_models -%} + ), - {%- set source_number = source_model.id | string -%} + {% set ns.last_cte = 'new_hashkeys' %} - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkeys_{{ source_number }} +{% endif %} - {%- if not loop.last %} - UNION - {% endif -%} +{# + In all incremental runs, the source needs to be scanned for all currently active hashkeys. + If they are no longer present, they will be deactived. 
+#} +{%- if is_incremental() %} - {%- endfor -%} + {%- if not source_is_single_batch %} + disappeared_hashkeys AS ( + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM deduplicated_incoming) ldts + ON 1 = 1 + LEFT JOIN deduplicated_incoming src + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = ldts.min_ldts + WHERE + cs.{{ is_active_alias }} = 1 + AND src.{{ tracked_hashkey }} IS NULL + AND ldts.min_ldts IS NOT NULL ), + {% else %} + disappeared_hashkeys AS ( - hashkey_union_dedupe_prep AS ( - - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} desc) as rn - FROM hashkeys_union + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM source_data) ldts + ON 1 = 1 + WHERE NOT EXISTS ( + SELECT + 1 + FROM source_data src + WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ) + AND cs.{{ is_active_alias }} = 1 + AND ldts.min_ldts IS NOT NULL ), + {% endif %} +{%- endif %} - hashkey_union_dedupe AS ( +records_to_insert AS ( - SELECT - {{ tracked_hashkey }}, - {{ src_rsrc }}, - {{ src_ldts }}, - {{ deleted_flag_alias }} - FROM hashkey_union_dedupe_prep - WHERE rn = 1 + {# + This first part of the UNION includes: + - for single-batch loads: Only is_active_alias = 1, deactivations are handled later + - for multi-batch loads: Ativation and deactivation inside the multiple loads + #} + SELECT + di.{{ tracked_hashkey }}, + di.{{ src_ldts }}, + di.{{ is_active_alias }} + FROM {{ ns.last_cte }} di - {%- set ns.last_cte = 'hashkey_union_dedupe' -%} - ), + {%- if is_incremental() %} - {%- endif %} + {# + For incremental 
multi-batch loads, the earliest to-be inserted status is compared to the current status. + It will only be inserted if the status changed. We use the ROW_NUMBER() + #} + {%- if not source_is_single_batch %} + WHERE NOT EXISTS ( + SELECT 1 + FROM current_status + WHERE {{ datavault4dbt.multikey(tracked_hashkey, prefix=['current_status', 'di'], condition='=') }} + AND {{ datavault4dbt.multikey(is_active_alias, prefix=['current_status', 'di'], condition='=') }} + AND di.{{ src_ldts }} = (SELECT MIN({{ src_ldts }}) FROM deduplicated_incoming) + ) + AND di.{{ src_ldts }} > (SELECT MAX({{ src_ldts }}) FROM {{ this }}) + {% endif %} - records_to_insert AS ( + {# + For all incremental loads, the disappeared hashkeys are UNIONed. + #} + UNION - SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} - FROM {{ ns.last_cte }} + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }}, + {{ is_active_alias }} + FROM disappeared_hashkeys - ) + {%- endif %} +) -{% endif %} +SELECT * +FROM records_to_insert ri -SELECT - {{ datavault4dbt.print_list(final_columns_to_select) }} -FROM records_to_insert +{% if is_incremental() %} +WHERE NOT EXISTS ( + SELECT 1 + FROM {{ this }} t + WHERE t.{{ tracked_hashkey }} = ri.{{ tracked_hashkey }} + AND t.{{ src_ldts }} = ri.{{ src_ldts }} +) +{% endif %} -{%- endmacro -%} +{%- endmacro -%} \ No newline at end of file From 42a6e9bc2eda2dc26859963de45b437bb3cfc632 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Fri, 20 Sep 2024 11:57:36 +0200 Subject: [PATCH 10/22] change hardcoded default for is_active_alias to is_active --- macros/tables/eff_sat_v0.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/tables/eff_sat_v0.sql b/macros/tables/eff_sat_v0.sql index c5569bc9..2857e263 100644 --- a/macros/tables/eff_sat_v0.sql +++ b/macros/tables/eff_sat_v0.sql @@ -3,7 +3,7 @@ {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are 
not set. #} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} {%- set src_rsrc = datavault4dbt.replace_standard(src_rsrc, 'datavault4dbt.rsrc_alias', 'rsrc') -%} - {%- set is_active_alias = datavault4dbt.replace_standard(is_active_alias, 'datavault4dbt.is_active_alias', 'deleted_flag') -%} + {%- set is_active_alias = datavault4dbt.replace_standard(is_active_alias, 'datavault4dbt.is_active_alias', 'is_active') -%} {{ return(adapter.dispatch('eff_sat_v0', 'datavault4dbt')(tracked_hashkey=tracked_hashkey, src_ldts=src_ldts, From 771d7bf5975d125bcd1435d5828c36131c5b6a92 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Fri, 20 Sep 2024 16:25:05 +0200 Subject: [PATCH 11/22] adapt eff sat macros for bigquery, exasol, postgres --- macros/tables/bigquery/eff_sat_v0.sql | 36 +++++++++++++++------- macros/tables/exasol/eff_sat_v0.sql | 27 +++++++++-------- macros/tables/postgres/eff_sat_v0.sql | 43 ++++++++++++++++++++------- 3 files changed, 72 insertions(+), 34 deletions(-) diff --git a/macros/tables/bigquery/eff_sat_v0.sql b/macros/tables/bigquery/eff_sat_v0.sql index 264fbb2a..690d49ad 100644 --- a/macros/tables/bigquery/eff_sat_v0.sql +++ b/macros/tables/bigquery/eff_sat_v0.sql @@ -1,6 +1,7 @@ {%- macro default__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} {%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set beginning_of_all_times = datavault4dbt.beginning_of_all_times() -%} {%- set timestamp_format = datavault4dbt.timestamp_format() -%} {%- set ns = namespace(last_cte= "") -%} @@ -27,7 +28,7 @@ source_data AS ( {{ tracked_hashkey }}, {{ src_ldts }} FROM {{ source_relation }} src - WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + WHERE {{ src_ldts }} NOT IN ({{ datavault4dbt.string_to_timestamp(timestamp_format, 
beginning_of_all_times) }}, {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }}) {%- if is_incremental() and not disable_hwm %} AND src.{{ src_ldts }} > ( SELECT @@ -122,23 +123,38 @@ current_status AS ( The rows are deduplicated on the is_active_alias, to only include status changes. Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. #} - deduplicated_incoming AS ( + deduplicated_incoming_prep AS ( SELECT is_active.{{ tracked_hashkey }}, is_active.{{ src_ldts }}, - is_active.{{ is_active_alias }} + is_active.{{ is_active_alias }}, + LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active {% if is_incremental() -%} , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn - {%- endif %} + {%- endif %} FROM is_active - QUALIFY - CASE - WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE - ELSE TRUE - END + + ), + + deduplicated_incoming AS ( + + SELECT + deduplicated_incoming_prep.{{ tracked_hashkey }}, + deduplicated_incoming_prep.{{ src_ldts }}, + deduplicated_incoming_prep.{{ is_active_alias }}, + LAG(deduplicated_incoming_prep.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active + + {% if is_incremental() -%} + , ROW_NUMBER() OVER(PARTITION BY deduplicated_incoming_prep.{{ tracked_hashkey }} ORDER BY deduplicated_incoming_prep.{{ src_ldts }}) as rn + {%- endif %} + + FROM + deduplicated_incoming_prep + WHERE + deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active ), @@ -265,7 +281,7 @@ records_to_insert AS ( {# For all incremental loads, the disappeared hashkeys are UNIONed. 
#} - UNION + UNION ALL SELECT {{ tracked_hashkey }}, diff --git a/macros/tables/exasol/eff_sat_v0.sql b/macros/tables/exasol/eff_sat_v0.sql index 7640ec81..9248be70 100644 --- a/macros/tables/exasol/eff_sat_v0.sql +++ b/macros/tables/exasol/eff_sat_v0.sql @@ -1,6 +1,7 @@ {%- macro exasol__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} {%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set beginning_of_all_times = datavault4dbt.beginning_of_all_times() -%} {%- set timestamp_format = datavault4dbt.timestamp_format() -%} {%- set ns = namespace(last_cte= "") -%} @@ -27,7 +28,7 @@ source_data AS ( {{ tracked_hashkey }}, {{ src_ldts }} FROM {{ source_relation }} src - WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + WHERE {{ src_ldts }} NOT IN ({{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }}, {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }}) {%- if is_incremental() and not disable_hwm %} AND src.{{ src_ldts }} > ( SELECT @@ -165,10 +166,10 @@ current_status AS ( This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. 
#} {% if is_incremental() %} - LEFT JOIN current_status cs - ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} - AND cs.{{ is_active_alias }} = 1 - WHERE cs.{{ tracked_hashkey }} IS NULL + LEFT JOIN current_status cus + ON src.{{ tracked_hashkey }} = cus.{{ tracked_hashkey }} + AND cus.{{ is_active_alias }} = 1 + WHERE cus.{{ tracked_hashkey }} IS NULL {% endif %} ), @@ -187,20 +188,20 @@ current_status AS ( disappeared_hashkeys AS ( SELECT DISTINCT - cs.{{ tracked_hashkey }}, + cus.{{ tracked_hashkey }}, ldts.min_ldts as {{ src_ldts }}, 0 as {{ is_active_alias }} - FROM current_status cs + FROM current_status cus LEFT JOIN ( SELECT MIN({{ src_ldts }}) as min_ldts FROM deduplicated_incoming) ldts ON 1 = 1 LEFT JOIN deduplicated_incoming src - ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ON src.{{ tracked_hashkey }} = cus.{{ tracked_hashkey }} AND src.{{ src_ldts }} = ldts.min_ldts WHERE - cs.{{ is_active_alias }} = 1 + cus.{{ is_active_alias }} = 1 AND src.{{ tracked_hashkey }} IS NULL AND ldts.min_ldts IS NOT NULL @@ -209,10 +210,10 @@ current_status AS ( disappeared_hashkeys AS ( SELECT DISTINCT - cs.{{ tracked_hashkey }}, + cus.{{ tracked_hashkey }}, ldts.min_ldts as {{ src_ldts }}, 0 as {{ is_active_alias }} - FROM current_status cs + FROM current_status cus LEFT JOIN ( SELECT MIN({{ src_ldts }}) as min_ldts @@ -222,9 +223,9 @@ current_status AS ( SELECT 1 FROM source_data src - WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + WHERE src.{{ tracked_hashkey }} = cus.{{ tracked_hashkey }} ) - AND cs.{{ is_active_alias }} = 1 + AND cus.{{ is_active_alias }} = 1 AND ldts.min_ldts IS NOT NULL ), diff --git a/macros/tables/postgres/eff_sat_v0.sql b/macros/tables/postgres/eff_sat_v0.sql index dcad92a7..d2c71f49 100644 --- a/macros/tables/postgres/eff_sat_v0.sql +++ b/macros/tables/postgres/eff_sat_v0.sql @@ -21,6 +21,16 @@ WITH {# In all cases, the source model is selected, and optionally a HWM is applied. 
#} +{% if is_incremental() and not disable_hwm %} +max_ldts_prep AS ( + + SELECT + MAX({{ src_ldts }}) AS max_ldts + FROM {{ this }} + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} +), +{% endif %} + source_data AS ( SELECT @@ -30,10 +40,7 @@ source_data AS ( WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') {%- if is_incremental() and not disable_hwm %} AND src.{{ src_ldts }} > ( - SELECT - MAX({{ src_ldts }}) - FROM {{ this }} - WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + SELECT max_ldts FROM max_ldts_prep ) {%- endif %} ), @@ -131,24 +138,38 @@ current_status AS ( The rows are deduplicated on the is_active_alias, to only include status changes. Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. #} - deduplicated_incoming AS ( + deduplicated_incoming_prep AS ( SELECT is_active.{{ tracked_hashkey }}, is_active.{{ src_ldts }}, is_active.{{ is_active_alias }}, - LAG(is_active.{{ is_active_alias }} OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }})) as lag_is_active + LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active {% if is_incremental() -%} , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn {%- endif %} FROM is_active - QUALIFY - CASE - WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE - ELSE TRUE - END + + ), + + deduplicated_incoming AS ( + + SELECT + deduplicated_incoming_prep.{{ tracked_hashkey }}, + deduplicated_incoming_prep.{{ src_ldts }}, + deduplicated_incoming_prep.{{ is_active_alias }}, + LAG(deduplicated_incoming_prep.{{ is_active_alias }}) OVER 
(PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active + + {% if is_incremental() -%} + , ROW_NUMBER() OVER(PARTITION BY deduplicated_incoming_prep.{{ tracked_hashkey }} ORDER BY deduplicated_incoming_prep.{{ src_ldts }}) as rn + {%- endif %} + + FROM + deduplicated_incoming_prep + WHERE + deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active ), From 3cda87b04a3d62bdfd203a8d2656625e68992d28 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Fri, 20 Sep 2024 16:25:58 +0200 Subject: [PATCH 12/22] cast exasol string_to_timestamp to timestamp_default_dtype --- macros/supporting/string_to_timestamp.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/supporting/string_to_timestamp.sql b/macros/supporting/string_to_timestamp.sql index 96313450..af6c4f90 100644 --- a/macros/supporting/string_to_timestamp.sql +++ b/macros/supporting/string_to_timestamp.sql @@ -8,7 +8,7 @@ {%- endmacro -%} {%- macro exasol__string_to_timestamp(format, timestamp) -%} - TO_TIMESTAMP('{{ timestamp }}', '{{ format }}') + CAST(TO_TIMESTAMP('{{ timestamp }}', '{{ format }}') AS {{ datavault4dbt.timestamp_default_dtype() }}) {%- endmacro -%} {%- macro snowflake__string_to_timestamp(format, timestamp) -%} From 130bba28417ede429373f797ece80febc0435c86 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 25 Sep 2024 12:39:55 +0200 Subject: [PATCH 13/22] remove unused rownumber and LAG() --- macros/tables/bigquery/eff_sat_v0.sql | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/macros/tables/bigquery/eff_sat_v0.sql b/macros/tables/bigquery/eff_sat_v0.sql index 690d49ad..ee1fa47b 100644 --- a/macros/tables/bigquery/eff_sat_v0.sql +++ b/macros/tables/bigquery/eff_sat_v0.sql @@ -131,10 +131,6 @@ current_status AS ( is_active.{{ is_active_alias }}, LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ 
tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active - {% if is_incremental() -%} - , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn - {%- endif %} - FROM is_active ), @@ -145,11 +141,7 @@ current_status AS ( deduplicated_incoming_prep.{{ tracked_hashkey }}, deduplicated_incoming_prep.{{ src_ldts }}, deduplicated_incoming_prep.{{ is_active_alias }}, - LAG(deduplicated_incoming_prep.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active - - {% if is_incremental() -%} - , ROW_NUMBER() OVER(PARTITION BY deduplicated_incoming_prep.{{ tracked_hashkey }} ORDER BY deduplicated_incoming_prep.{{ src_ldts }}) as rn - {%- endif %} + deduplicated_incoming_prep.lag_is_active FROM deduplicated_incoming_prep From 3b426879ee7187904816c4916f3d3946a2f58ad7 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Fri, 27 Sep 2024 16:53:45 +0200 Subject: [PATCH 14/22] finalize eff_sats for bigquery, posgres, redshift and synapse --- macros/tables/bigquery/eff_sat_v0.sql | 4 +-- macros/tables/postgres/eff_sat_v0.sql | 12 ++------ macros/tables/redshift/eff_sat_v0.sql | 44 ++++++++++++++++++--------- macros/tables/synapse/eff_sat_v0.sql | 44 ++++++++++++++++++--------- 4 files changed, 62 insertions(+), 42 deletions(-) diff --git a/macros/tables/bigquery/eff_sat_v0.sql b/macros/tables/bigquery/eff_sat_v0.sql index ee1fa47b..b9b70a80 100644 --- a/macros/tables/bigquery/eff_sat_v0.sql +++ b/macros/tables/bigquery/eff_sat_v0.sql @@ -140,13 +140,13 @@ current_status AS ( SELECT deduplicated_incoming_prep.{{ tracked_hashkey }}, deduplicated_incoming_prep.{{ src_ldts }}, - deduplicated_incoming_prep.{{ is_active_alias }}, - deduplicated_incoming_prep.lag_is_active + deduplicated_incoming_prep.{{ is_active_alias }} FROM deduplicated_incoming_prep WHERE deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active + OR 
deduplicated_incoming_prep.lag_is_active IS NULL ), diff --git a/macros/tables/postgres/eff_sat_v0.sql b/macros/tables/postgres/eff_sat_v0.sql index d2c71f49..ef8d63c8 100644 --- a/macros/tables/postgres/eff_sat_v0.sql +++ b/macros/tables/postgres/eff_sat_v0.sql @@ -146,10 +146,6 @@ current_status AS ( is_active.{{ is_active_alias }}, LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active - {% if is_incremental() -%} - , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn - {%- endif %} - FROM is_active ), @@ -159,17 +155,13 @@ current_status AS ( SELECT deduplicated_incoming_prep.{{ tracked_hashkey }}, deduplicated_incoming_prep.{{ src_ldts }}, - deduplicated_incoming_prep.{{ is_active_alias }}, - LAG(deduplicated_incoming_prep.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active - - {% if is_incremental() -%} - , ROW_NUMBER() OVER(PARTITION BY deduplicated_incoming_prep.{{ tracked_hashkey }} ORDER BY deduplicated_incoming_prep.{{ src_ldts }}) as rn - {%- endif %} + deduplicated_incoming_prep.{{ is_active_alias }} FROM deduplicated_incoming_prep WHERE deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active + OR deduplicated_incoming_prep.lag_is_active IS NULL ), diff --git a/macros/tables/redshift/eff_sat_v0.sql b/macros/tables/redshift/eff_sat_v0.sql index 7133c700..3483db4a 100644 --- a/macros/tables/redshift/eff_sat_v0.sql +++ b/macros/tables/redshift/eff_sat_v0.sql @@ -21,6 +21,16 @@ WITH {# In all cases, the source model is selected, and optionally a HWM is applied. 
#} +{% if is_incremental() and not disable_hwm %} +max_ldts_prep AS ( + + SELECT + MAX({{ src_ldts }}) AS max_ldts + FROM {{ this }} + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} +), +{% endif %} + source_data AS ( SELECT @@ -30,10 +40,7 @@ source_data AS ( WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') {%- if is_incremental() and not disable_hwm %} AND src.{{ src_ldts }} > ( - SELECT - MAX({{ src_ldts }}) - FROM {{ this }} - WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + SELECT max_ldts FROM max_ldts_prep ) {%- endif %} ), @@ -131,23 +138,30 @@ current_status AS ( The rows are deduplicated on the is_active_alias, to only include status changes. Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. #} - deduplicated_incoming AS ( + deduplicated_incoming_prep AS ( SELECT is_active.{{ tracked_hashkey }}, is_active.{{ src_ldts }}, - is_active.{{ is_active_alias }} - - {% if is_incremental() -%} - , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn - {%- endif %} + is_active.{{ is_active_alias }}, + LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active FROM is_active - QUALIFY - CASE - WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE - ELSE TRUE - END + + ), + + deduplicated_incoming AS ( + + SELECT + deduplicated_incoming_prep.{{ tracked_hashkey }}, + deduplicated_incoming_prep.{{ src_ldts }}, + deduplicated_incoming_prep.{{ is_active_alias }} + + FROM + deduplicated_incoming_prep + WHERE + deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active + OR 
deduplicated_incoming_prep.lag_is_active IS NULL ), diff --git a/macros/tables/synapse/eff_sat_v0.sql b/macros/tables/synapse/eff_sat_v0.sql index f6257393..3bac2383 100644 --- a/macros/tables/synapse/eff_sat_v0.sql +++ b/macros/tables/synapse/eff_sat_v0.sql @@ -21,6 +21,16 @@ WITH {# In all cases, the source model is selected, and optionally a HWM is applied. #} +{% if is_incremental() and not disable_hwm %} +max_ldts_prep AS ( + + SELECT + MAX({{ src_ldts }}) AS max_ldts + FROM {{ this }} + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} +), +{% endif %} + source_data AS ( SELECT @@ -30,10 +40,7 @@ source_data AS ( WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') {%- if is_incremental() and not disable_hwm %} AND src.{{ src_ldts }} > ( - SELECT - MAX({{ src_ldts }}) - FROM {{ this }} - WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + SELECT max_ldts FROM max_ldts_prep ) {%- endif %} ), @@ -131,23 +138,30 @@ current_status AS ( The rows are deduplicated on the is_active_alias, to only include status changes. Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. 
#} - deduplicated_incoming AS ( + deduplicated_incoming_prep AS ( SELECT is_active.{{ tracked_hashkey }}, is_active.{{ src_ldts }}, - is_active.{{ is_active_alias }} - - {% if is_incremental() -%} - , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn - {%- endif %} + is_active.{{ is_active_alias }}, + LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active FROM is_active - QUALIFY - CASE - WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE - ELSE TRUE - END + + ), + + deduplicated_incoming AS ( + + SELECT + deduplicated_incoming_prep.{{ tracked_hashkey }}, + deduplicated_incoming_prep.{{ src_ldts }}, + deduplicated_incoming_prep.{{ is_active_alias }} + + FROM + deduplicated_incoming_prep + WHERE + deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active + OR deduplicated_incoming_prep.lag_is_active IS NULL ), From 255424505d2296ece9e5bd657f4968fc8bdfd565 Mon Sep 17 00:00:00 2001 From: Tim Derksen Date: Fri, 11 Oct 2024 13:57:30 +0200 Subject: [PATCH 15/22] add eff_sat for oracle --- macros/tables/oracle/eff_sat_v0.sql | 316 ++++++++++++++++++++++++++++ 1 file changed, 316 insertions(+) create mode 100644 macros/tables/oracle/eff_sat_v0.sql diff --git a/macros/tables/oracle/eff_sat_v0.sql b/macros/tables/oracle/eff_sat_v0.sql new file mode 100644 index 00000000..2ff92746 --- /dev/null +++ b/macros/tables/oracle/eff_sat_v0.sql @@ -0,0 +1,316 @@ +{%- macro oracle__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} + +{%- set beginning_of_all_times = datavault4dbt.beginning_of_all_times() -%} +{%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set timestamp_format = datavault4dbt.timestamp_format() -%} + +{%- set ns = 
namespace(last_cte= "") -%} + +{%- set source_relation = ref(source_model) -%} + +{%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} +{%- set is_active_alias = datavault4dbt.escape_column_names(is_active_alias) -%} +{%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} +{%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} + +{{ log('columns to select: '~final_columns_to_select, false) }} + +{{ datavault4dbt.prepend_generated_by() }} + +WITH + +{# + In all cases, the source model is selected, and optionally a HWM is applied. +#} +{% if is_incremental() and not disable_hwm %} +max_ldts_prep AS ( + + SELECT + MAX({{ src_ldts }}) AS max_ldts + FROM {{ this }} + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} +), +{% endif %} + +source_data AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }} + FROM {{ source_relation }} src + WHERE {{ src_ldts }} NOT IN ({{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }}, {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }}) + {%- if is_incremental() and not disable_hwm %} + AND src.{{ src_ldts }} > ( + SELECT max_ldts FROM max_ldts_prep + ) + {%- endif %} +), + +{# + In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. +#} +{%- if is_incremental() %} +current_status_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias}}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) as rn + FROM {{ this }} + +), + +current_status AS ( + + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias }} + FROM current_status_prep + WHERE rn = 1 + +), +{% endif %} + +{# + This block is for multi-batch processing. +#} +{% if not source_is_single_batch %} + + {# + List of all Hashkeys with their date of first appearance in the source model. 
+ #} + hashkeys AS ( + + SELECT + {{ tracked_hashkey }}, + MIN({{ src_ldts }}) as first_appearance + FROM source_data + GROUP BY {{ tracked_hashkey }} + + ), + + {# + Distinct list of load dates in the multi-batch source. + #} + load_dates AS ( + + SELECT Distinct + {{ src_ldts }} + FROM source_data + + ), + + {# + All combinations of hashkeys and loaddates, for loaddates after the first appearance of a hashkey. + #} + history AS ( + + SELECT + hk.{{ tracked_hashkey }}, + ld.{{ src_ldts }} + FROM hashkeys hk + CROSS JOIN load_dates ld + WHERE ld.{{ src_ldts }} >= hk.first_appearance + + ), + + {# + All theoretical combinations are checked against the actual occurrences of hashkeys in each batch / loaddate. + If a Hashkey is part of a load/batch, is_active_alias is set to 1, because the hashkey was active in that load/batch. + If a Hashkey is not part of a load/batch, is_active_alias is set to 0, because the hashkey was not active in that load/batch. + #} + is_active AS ( + + SELECT + h.{{ tracked_hashkey }}, + h.{{ src_ldts }}, + CASE + WHEN src.{{ tracked_hashkey }} IS NULL THEN 0 + ELSE 1 + END as {{ is_active_alias }} + FROM history h + LEFT JOIN source_data src + ON src.{{ tracked_hashkey }} = h.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = h.{{ src_ldts }} + + ), + + {# + The rows are deduplicated on the is_active_alias, to only include status changes. + Additionally, the previous status of each hashkey is fetched via LAG(), so that the next step can keep only rows where the status actually changed. 
+ #} + deduplicated_incoming_prep AS ( + + SELECT + is_active.{{ tracked_hashkey }}, + is_active.{{ src_ldts }}, + is_active.{{ is_active_alias }}, + LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active + + FROM is_active + + ), + + deduplicated_incoming AS ( + + SELECT + deduplicated_incoming_prep.{{ tracked_hashkey }}, + deduplicated_incoming_prep.{{ src_ldts }}, + deduplicated_incoming_prep.{{ is_active_alias }} + + FROM + deduplicated_incoming_prep + WHERE + deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active + OR deduplicated_incoming_prep.lag_is_active IS NULL + + ), + + {% set ns.last_cte = 'deduplicated_incoming' %} + +{# + This block is for single-batch processing +#} +{% else %} + + {# + In initial loads of single-batch eff sats, every hashkey of the source is set to active. + #} + new_hashkeys AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey }}, + src.{{ src_ldts }}, + 1 as {{ is_active_alias }} + FROM source_data src + + {# + For incremental runs of single-batch eff sats, only hashkeys that are not active right now are set to active. + This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. + #} + {% if is_incremental() %} + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ is_active_alias }} = 1 + WHERE cs.{{ tracked_hashkey }} IS NULL + {% endif %} + + ), + + {% set ns.last_cte = 'new_hashkeys' %} + +{% endif %} + +{# + In all incremental runs, the source needs to be scanned for all currently active hashkeys. + If they are no longer present, they will be deactivated. 
+#} +{%- if is_incremental() %} + + {%- if not source_is_single_batch %} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM deduplicated_incoming) ldts + ON 1 = 1 + LEFT JOIN deduplicated_incoming src + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = ldts.min_ldts + WHERE + cs.{{ is_active_alias }} = 1 + AND src.{{ tracked_hashkey }} IS NULL + AND ldts.min_ldts IS NOT NULL + + ), + {% else %} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM source_data) ldts + ON 1 = 1 + WHERE NOT EXISTS ( + SELECT + 1 + FROM source_data src + WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ) + AND cs.{{ is_active_alias }} = 1 + AND ldts.min_ldts IS NOT NULL + + ), + {% endif %} +{%- endif %} + +records_to_insert AS ( + + {# + This first part of the UNION includes: + - for single-batch loads: Only is_active_alias = 1, deactivations are handled later + - for multi-batch loads: Activation and deactivation inside the multiple loads + #} + SELECT + di.{{ tracked_hashkey }}, + di.{{ src_ldts }}, + di.{{ is_active_alias }} + FROM {{ ns.last_cte }} di + + + {%- if is_incremental() %} + + {# + For incremental multi-batch loads, the earliest to-be inserted status is compared to the current status. + It will only be inserted if the status changed. 
We use the ROW_NUMBER() + #} + {%- if not source_is_single_batch %} + WHERE NOT EXISTS ( + SELECT 1 + FROM current_status + WHERE {{ datavault4dbt.multikey(tracked_hashkey, prefix=['current_status', 'di'], condition='=') }} + AND {{ datavault4dbt.multikey(is_active_alias, prefix=['current_status', 'di'], condition='=') }} + AND di.{{ src_ldts }} = (SELECT MIN({{ src_ldts }}) FROM deduplicated_incoming) + ) + AND di.{{ src_ldts }} > (SELECT MAX({{ src_ldts }}) FROM {{ this }}) + {% endif %} + + {# + For all incremental loads, the disappeared hashkeys are UNIONed. + #} + UNION + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }}, + {{ is_active_alias }} + FROM disappeared_hashkeys + + {%- endif %} + +) + +SELECT * +FROM records_to_insert ri + +{% if is_incremental() %} +WHERE NOT EXISTS ( + SELECT 1 + FROM {{ this }} t + WHERE t.{{ tracked_hashkey }} = ri.{{ tracked_hashkey }} + AND t.{{ src_ldts }} = ri.{{ src_ldts }} +) +{% endif %} + +{%- endmacro -%} \ No newline at end of file From 26823c87279f860e16db599edc690923a440e76f Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 23 Oct 2024 14:12:41 +0200 Subject: [PATCH 16/22] add eff_sat_v0 for fabric --- macros/tables/fabric/eff_sat_v0.sql | 315 ++++++++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 macros/tables/fabric/eff_sat_v0.sql diff --git a/macros/tables/fabric/eff_sat_v0.sql b/macros/tables/fabric/eff_sat_v0.sql new file mode 100644 index 00000000..c50d5d34 --- /dev/null +++ b/macros/tables/fabric/eff_sat_v0.sql @@ -0,0 +1,315 @@ +{%- macro fabric__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} + +{%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set timestamp_format = datavault4dbt.timestamp_format() -%} + +{%- set ns = namespace(last_cte= "") -%} + +{%- set source_relation = ref(source_model) -%} + +{%- set tracked_hashkey = 
datavault4dbt.escape_column_names(tracked_hashkey) -%} +{%- set is_active_alias = datavault4dbt.escape_column_names(is_active_alias) -%} +{%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} +{%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} + +{{ log('columns to select: '~final_columns_to_select, false) }} + +{{ datavault4dbt.prepend_generated_by() }} + +WITH + +{# + In all cases, the source model is selected, and optionally a HWM is applied. +#} +{% if is_incremental() and not disable_hwm %} +max_ldts_prep AS ( + + SELECT + MAX({{ src_ldts }}) AS max_ldts + FROM {{ this }} + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} +), +{% endif %} + +source_data AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }} + FROM {{ source_relation }} src + WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + {%- if is_incremental() and not disable_hwm %} + AND src.{{ src_ldts }} > ( + SELECT max_ldts FROM max_ldts_prep + ) + {%- endif %} +), + +{# + In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. +#} +{%- if is_incremental() %} +current_status_prep AS ( + + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias}}, + ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) as rn + FROM {{ this }} + +), + +current_status AS ( + + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias }} + FROM current_status_prep + WHERE rn = 1 + +), +{% endif %} + +{# + This block is for multi-batch processing. +#} +{% if not source_is_single_batch %} + + {# + List of all Hashkeys with their date of first appearance in the source model. + #} + hashkeys AS ( + + SELECT + {{ tracked_hashkey }}, + MIN({{ src_ldts }}) as first_appearance + FROM source_data + GROUP BY {{ tracked_hashkey }} + + ), + + {# + Distinct list of load dates in the multi-batch source. 
+ #} + load_dates AS ( + + SELECT Distinct + {{ src_ldts }} + FROM source_data + + ), + + {# + All combinations of hashkeys and loaddates, for loaddates after the first appearance of a hashkey. + #} + history AS ( + + SELECT + hk.{{ tracked_hashkey }}, + ld.{{ src_ldts }} + FROM hashkeys hk + CROSS JOIN load_dates ld + WHERE ld.{{ src_ldts }} >= hk.first_appearance + + ), + + {# + All theoretical combinations are checked against the actual occurrences of hashkeys in each batch / loaddate. + If a Hashkey is part of a load/batch, is_active_alias is set to 1, because the hashkey was active in that load/batch. + If a Hashkey is not part of a load/batch, is_active_alias is set to 0, because the hashkey was not active in that load/batch. + #} + is_active AS ( + + SELECT + h.{{ tracked_hashkey }}, + h.{{ src_ldts }}, + CASE + WHEN src.{{ tracked_hashkey }} IS NULL THEN 0 + ELSE 1 + END as {{ is_active_alias }} + FROM history h + LEFT JOIN source_data src + ON src.{{ tracked_hashkey }} = h.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = h.{{ src_ldts }} + + ), + + {# + The rows are deduplicated on the is_active_alias, to only include status changes. + Additionally, the previous status of each hashkey is fetched via LAG(), so that the next step can keep only rows where the status actually changed. 
+ #} + deduplicated_incoming_prep AS ( + + SELECT + is_active.{{ tracked_hashkey }}, + is_active.{{ src_ldts }}, + is_active.{{ is_active_alias }}, + LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active + + FROM is_active + + ), + + deduplicated_incoming AS ( + + SELECT + deduplicated_incoming_prep.{{ tracked_hashkey }}, + deduplicated_incoming_prep.{{ src_ldts }}, + deduplicated_incoming_prep.{{ is_active_alias }} + + FROM + deduplicated_incoming_prep + WHERE + deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active + OR deduplicated_incoming_prep.lag_is_active IS NULL + + ), + + {% set ns.last_cte = 'deduplicated_incoming' %} + +{# + This block is for single-batch processing +#} +{% else %} + + {# + In initial loads of single-batch eff sats, every hashkey of the source is set to active. + #} + new_hashkeys AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey }}, + src.{{ src_ldts }}, + 1 as {{ is_active_alias }} + FROM source_data src + + {# + For incremental runs of single-batch eff sats, only hashkeys that are not active right now are set to active. + This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. + #} + {% if is_incremental() %} + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ is_active_alias }} = 1 + WHERE cs.{{ tracked_hashkey }} IS NULL + {% endif %} + + ), + + {% set ns.last_cte = 'new_hashkeys' %} + +{% endif %} + +{# + In all incremental runs, the source needs to be scanned for all currently active hashkeys. + If they are no longer present, they will be deactivated. 
+#} +{%- if is_incremental() %} + + {%- if not source_is_single_batch %} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM deduplicated_incoming) ldts + ON 1 = 1 + LEFT JOIN deduplicated_incoming src + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = ldts.min_ldts + WHERE + cs.{{ is_active_alias }} = 1 + AND src.{{ tracked_hashkey }} IS NULL + AND ldts.min_ldts IS NOT NULL + + ), + {% else %} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM source_data) ldts + ON 1 = 1 + WHERE NOT EXISTS ( + SELECT + 1 + FROM source_data src + WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ) + AND cs.{{ is_active_alias }} = 1 + AND ldts.min_ldts IS NOT NULL + + ), + {% endif %} +{%- endif %} + +records_to_insert AS ( + + {# + This first part of the UNION includes: + - for single-batch loads: Only is_active_alias = 1, deactivations are handled later + - for multi-batch loads: Activation and deactivation inside the multiple loads + #} + SELECT + di.{{ tracked_hashkey }}, + di.{{ src_ldts }}, + di.{{ is_active_alias }} + FROM {{ ns.last_cte }} di + + + {%- if is_incremental() %} + + {# + For incremental multi-batch loads, the earliest to-be inserted status is compared to the current status. + It will only be inserted if the status changed. 
We use the ROW_NUMBER() + #} + {%- if not source_is_single_batch %} + WHERE NOT EXISTS ( + SELECT 1 + FROM current_status + WHERE {{ datavault4dbt.multikey(tracked_hashkey, prefix=['current_status', 'di'], condition='=') }} + AND {{ datavault4dbt.multikey(is_active_alias, prefix=['current_status', 'di'], condition='=') }} + AND di.{{ src_ldts }} = (SELECT MIN({{ src_ldts }}) FROM deduplicated_incoming) + ) + AND di.{{ src_ldts }} > (SELECT MAX({{ src_ldts }}) FROM {{ this }}) + {% endif %} + + {# + For all incremental loads, the disappeared hashkeys are UNIONed. + #} + UNION + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }}, + {{ is_active_alias }} + FROM disappeared_hashkeys + + {%- endif %} + +) + +SELECT * +FROM records_to_insert ri + +{% if is_incremental() %} +WHERE NOT EXISTS ( + SELECT 1 + FROM {{ this }} t + WHERE t.{{ tracked_hashkey }} = ri.{{ tracked_hashkey }} + AND t.{{ src_ldts }} = ri.{{ src_ldts }} +) +{% endif %} + +{%- endmacro -%} \ No newline at end of file From 547447c9cf05c47cb235babebc94a8c01c08aa60 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Mon, 11 Nov 2024 11:59:01 +0100 Subject: [PATCH 17/22] fix:synapse/exasol: ghostrecords for timestamps didnt use alias-variable --- macros/supporting/ghost_record_per_datatype.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/macros/supporting/ghost_record_per_datatype.sql b/macros/supporting/ghost_record_per_datatype.sql index 341715dc..03698646 100644 --- a/macros/supporting/ghost_record_per_datatype.sql +++ b/macros/supporting/ghost_record_per_datatype.sql @@ -82,7 +82,7 @@ {%- if ghost_record_type == 'unknown' -%} - {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH LOCAL TIMEZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }} as "{{ column_name }}" + {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH LOCAL TIMEZONE' %} {{- datavault4dbt.string_to_timestamp( 
timestamp_format , beginning_of_all_times) }} as {{ alias }} {%- elif datatype == 'DATE'-%} TO_DATE('{{ beginning_of_all_times_date }}', '{{ date_format }}' ) as {{ alias }} {%- elif datatype.upper().startswith('VARCHAR') -%} {%- if col_size is not none -%} @@ -110,7 +110,7 @@ {%- elif ghost_record_type == 'error' -%} - {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH LOCAL TIME ZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }} as "{{ column_name }}" + {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH LOCAL TIME ZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }} as {{ alias }} {%- elif datatype == 'DATE'-%} TO_DATE('{{ end_of_all_times_date }}', '{{ date_format }}' ) as {{ alias }} {%- elif datatype.upper().startswith('VARCHAR') -%} {%- if col_size is not none -%} @@ -267,7 +267,7 @@ {%- if ghost_record_type == 'unknown' -%} - {%- if datatype in ['DATETIME', 'DATETIME2', 'DATETIMEOFFSET'] %} CONVERT({{ datatype }}, {{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }}) as "{{ column_name }}" + {%- if datatype in ['DATETIME', 'DATETIME2', 'DATETIMEOFFSET'] %} CONVERT({{ datatype }}, {{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }}) as "{{ alias }}" {%- elif 'CHAR' in datatype -%} {%- if col_size is not none -%} {%- if (col_size | int) == -1 -%} @@ -297,7 +297,7 @@ {%- elif ghost_record_type == 'error' -%} - {%- if datatype in ['DATETIME', 'DATETIME2', 'DATETIMEOFFSET'] %} CONVERT({{ datatype }}, {{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }}) as "{{ column_name }}" + {%- if datatype in ['DATETIME', 'DATETIME2', 'DATETIMEOFFSET'] %} CONVERT({{ datatype }}, {{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }}) as "{{ alias }}" {%- elif 'CHAR' in datatype -%} {%- if col_size is not none -%} {%- if (col_size | int) == -1 -%} From 
aa6c94221fad52813a6f3079ca4d94dbf2d48508 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:26:55 +0100 Subject: [PATCH 18/22] add databricks__eff_sat_v0-macro --- macros/tables/databricks/eff_sat_v0.sql | 292 ++++++++++++++++++++++++ 1 file changed, 292 insertions(+) create mode 100644 macros/tables/databricks/eff_sat_v0.sql diff --git a/macros/tables/databricks/eff_sat_v0.sql b/macros/tables/databricks/eff_sat_v0.sql new file mode 100644 index 00000000..7dbdd2ee --- /dev/null +++ b/macros/tables/databricks/eff_sat_v0.sql @@ -0,0 +1,292 @@ +{%- macro databricks__eff_sat_v0(source_model, tracked_hashkey, src_ldts, src_rsrc, is_active_alias, source_is_single_batch, disable_hwm) -%} + +{%- set end_of_all_times = datavault4dbt.end_of_all_times() -%} +{%- set timestamp_format = datavault4dbt.timestamp_format() -%} + +{%- set ns = namespace(last_cte= "") -%} + +{%- set source_relation = ref(source_model) -%} + +{%- set tracked_hashkey = datavault4dbt.escape_column_names(tracked_hashkey) -%} +{%- set is_active_alias = datavault4dbt.escape_column_names(is_active_alias) -%} +{%- set src_ldts = datavault4dbt.escape_column_names(src_ldts) -%} +{%- set src_rsrc = datavault4dbt.escape_column_names(src_rsrc) -%} + +{{ log('columns to select: '~final_columns_to_select, false) }} {# NOTE(review): final_columns_to_select is not defined anywhere in this macro; presumably a leftover debug line - confirm. #} + +{{ datavault4dbt.prepend_generated_by() }} + +WITH + +{# + In all cases, the source model is selected; rows whose load date equals the beginning or end of all times are excluded, and in incremental runs a HWM is applied unless disable_hwm is set. NOTE(review): the exclusion list uses quoted string literals while the HWM subquery converts via string_to_timestamp - confirm both compare correctly against the load date column. 
+#} +source_data AS ( + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }} + FROM {{ source_relation }} src + WHERE {{ src_ldts }} NOT IN ('{{ datavault4dbt.beginning_of_all_times() }}', '{{ datavault4dbt.end_of_all_times() }}') + {%- if is_incremental() and not disable_hwm %} + AND src.{{ src_ldts }} > ( + SELECT + MAX({{ src_ldts }}) + FROM {{ this }} + WHERE {{ src_ldts }} != {{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} + ) + {%- endif %} +), + +{# + In all incremental cases, the current status for each hashkey (the row with the latest load date per hashkey) is selected from the existing Effectivity Satellite. +#} +{%- if is_incremental() %} +current_status AS ( + + SELECT + {{ tracked_hashkey }}, + {{ is_active_alias }} + FROM {{ this }} + QUALIFY + ROW_NUMBER() OVER(PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 + +), +{% endif %} + +{# + This block is for multi-batch processing. +#} +{% if not source_is_single_batch %} + + {# + List of all Hashkeys with their date of first appearance in the source model. + #} + hashkeys AS ( + + SELECT + {{ tracked_hashkey }}, + MIN({{ src_ldts }}) as first_appearance + FROM source_data + GROUP BY {{ tracked_hashkey }} + + ), + + {# + Distinct list of load dates in the multi-batch source. + #} + load_dates AS ( + + SELECT Distinct + {{ src_ldts }} + FROM source_data + + ), + + {# + All combinations of hashkeys and loaddates, for loaddates on or after the first appearance of a hashkey. + #} + history AS ( + + SELECT + hk.{{ tracked_hashkey }}, + ld.{{ src_ldts }} + FROM hashkeys hk + CROSS JOIN load_dates ld + WHERE ld.{{ src_ldts }} >= hk.first_appearance + + ), + + {# + All theoretical combinations are checked against the actual occurrences of hashkeys in each batch / loaddate. + If a Hashkey is part of a load/batch, is_active_alias is set to 1, because the hashkey was active in that load/batch. + If a Hashkey is not part of a load/batch, is_active_alias is set to 0, because the hashkey was not active in that load/batch. 
+ #} + is_active AS ( + + SELECT + h.{{ tracked_hashkey }}, + h.{{ src_ldts }}, + CASE + WHEN src.{{ tracked_hashkey }} IS NULL THEN 0 + ELSE 1 + END as {{ is_active_alias }} + FROM history h + LEFT JOIN source_data src + ON src.{{ tracked_hashkey }} = h.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = h.{{ src_ldts }} + + ), + + {# + The rows are deduplicated on the is_active_alias, to only include status changes: a row is kept when its status differs from the previous load date of the same hashkey, or when there is no previous row. + Additionally, a ROW_NUMBER() (rn) is calculated in incremental runs. NOTE(review): rn is not referenced by any later CTE of this macro - confirm whether it is still needed. + #} + deduplicated_incoming AS ( + + SELECT + is_active.{{ tracked_hashkey }}, + is_active.{{ src_ldts }}, + is_active.{{ is_active_alias }} + + {% if is_incremental() -%} + , ROW_NUMBER() OVER(PARTITION BY is_active.{{ tracked_hashkey }} ORDER BY is_active.{{ src_ldts }}) as rn + {%- endif %} + + FROM is_active + QUALIFY + CASE + WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END + + ), + + {% set ns.last_cte = 'deduplicated_incoming' %} + +{# + This block is for single-batch processing +#} +{% else %} + + {# + In initial loads of single-batch eff sats, every hashkey of the source is set to active. + #} + new_hashkeys AS ( + + SELECT DISTINCT + src.{{ tracked_hashkey }}, + src.{{ src_ldts }}, + 1 as {{ is_active_alias }} + FROM source_data src + + {# + For incremental runs of single-batch eff sats, only hashkeys that are not active right now are set to active. + This automatically includes totally new hashkeys, or hashkeys that are currently set to inactive. 
+ #} + {% if is_incremental() %} + LEFT JOIN current_status cs + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND cs.{{ is_active_alias }} = 1 + WHERE cs.{{ tracked_hashkey }} IS NULL + {% endif %} + + ), + + {% set ns.last_cte = 'new_hashkeys' %} + +{% endif %} + +{# + In all incremental runs, the source needs to be scanned for all currently active hashkeys. + If they are no longer present, they will be deactivated. The deactivation row is stamped with the earliest incoming load date. +#} +{%- if is_incremental() %} + + {%- if not source_is_single_batch %} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM deduplicated_incoming) ldts + ON 1 = 1 + LEFT JOIN deduplicated_incoming src + ON src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + AND src.{{ src_ldts }} = ldts.min_ldts + WHERE + cs.{{ is_active_alias }} = 1 + AND src.{{ tracked_hashkey }} IS NULL + AND ldts.min_ldts IS NOT NULL + + ), + {% else %} + disappeared_hashkeys AS ( + + SELECT DISTINCT + cs.{{ tracked_hashkey }}, + ldts.min_ldts as {{ src_ldts }}, + 0 as {{ is_active_alias }} + FROM current_status cs + LEFT JOIN ( + SELECT + MIN({{ src_ldts }}) as min_ldts + FROM source_data) ldts + ON 1 = 1 + WHERE NOT EXISTS ( + SELECT + 1 + FROM source_data src + WHERE src.{{ tracked_hashkey }} = cs.{{ tracked_hashkey }} + ) + AND cs.{{ is_active_alias }} = 1 + AND ldts.min_ldts IS NOT NULL + + ), + {% endif %} +{%- endif %} + +records_to_insert AS ( + + {# + This first part of the UNION includes: + - for single-batch loads: Only is_active_alias = 1, deactivations are handled later + - for multi-batch loads: Activation and deactivation inside the multiple loads + #} + SELECT + di.{{ tracked_hashkey }}, + di.{{ src_ldts }}, + di.{{ is_active_alias }} + FROM {{ ns.last_cte }} di + + + {%- if is_incremental() %} + + {# + For 
incremental multi-batch loads, the earliest to-be inserted status is 
compared to the current status. + It will only be inserted if the status changed. The earliest status is identified by the minimum load date of deduplicated_incoming (no ROW_NUMBER() is used here); rows must also be newer than the latest load date already in the target. + #} + {%- if not source_is_single_batch %} + WHERE NOT EXISTS ( + SELECT 1 + FROM current_status + WHERE {{ datavault4dbt.multikey(tracked_hashkey, prefix=['current_status', 'di'], condition='=') }} + AND {{ datavault4dbt.multikey(is_active_alias, prefix=['current_status', 'di'], condition='=') }} + AND di.{{ src_ldts }} = (SELECT MIN({{ src_ldts }}) FROM deduplicated_incoming) + ) + AND di.{{ src_ldts }} > (SELECT MAX({{ src_ldts }}) FROM {{ this }}) + {% endif %} + + {# + For all incremental loads, the disappeared hashkeys are UNIONed. + #} + UNION + + SELECT + {{ tracked_hashkey }}, + {{ src_ldts }}, + {{ is_active_alias }} + FROM disappeared_hashkeys + + {%- endif %} + +) + +SELECT * +FROM records_to_insert ri + +{% if is_incremental() %} +WHERE NOT EXISTS ( + SELECT 1 + FROM {{ this }} t + WHERE t.{{ tracked_hashkey }} = ri.{{ tracked_hashkey }} + AND t.{{ src_ldts }} = ri.{{ src_ldts }} +) +{% endif %} + +{%- endmacro -%} \ No newline at end of file From 5504a5e8d485a88a9f7521311dcac2a338d95277 Mon Sep 17 00:00:00 2001 From: Tim Kirschke <81677440+tkirschke@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:19:44 +0100 Subject: [PATCH 19/22] Update README.md --- README.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a982a07f..32410700 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,21 @@ # datavault4dbt by [Scalefree International GmbH](https://www.scalefree.com) - +[](https://www.datavault4dbt.com/) --- ### Included Macros -- Staging Area (For Hashing, prejoins and ghost records) -- Hubs, Links & Satellites (allowing multiple deltas) -- Non-Historized Links and Satellites -- Multi-Active Satellites -- Virtualized End-Dating (in Satellites) -- Reference Hubs, - Satellites, and - Tables -- PIT Tables - - Hook for Cleaning up PITs -- Snapshot Control +- [Staging Area (For 
Hashing, prejoins and ghost records)](https://www.datavault4dbt.com/documentation/macro-instructions/staging/) +- [Hubs](https://www.datavault4dbt.com/documentation/macro-instructions/hubs/standard-hub/), [Links](https://www.datavault4dbt.com/documentation/macro-instructions/links/standard-link/) & [Satellites](https://www.datavault4dbt.com/documentation/macro-instructions/satellites/standard-satellite/standard-satellite-v0/) (allowing multiple deltas) +- [Non-Historized Links](https://www.datavault4dbt.com/documentation/macro-instructions/links/non-historized-link/) and [Satellites](https://www.datavault4dbt.com/documentation/macro-instructions/satellites/non-historized-satellite/) +- [Multi-Active Satellites](https://www.datavault4dbt.com/documentation/macro-instructions/satellites/multi-active-satellite/multi-active-satellite-v0/) +- [Effectivity](https://www.datavault4dbt.com/documentation/macro-instructions/satellites/effectivity-satellite/) and [Record Tracking Satellites](https://www.datavault4dbt.com/documentation/macro-instructions/satellites/record-tracking-satellite/) +- [Virtualized End-Dating (in Satellites)](https://www.datavault4dbt.com/documentation/macro-instructions/satellites/standard-satellite/standard-satellite-v1/) +- [Reference Hubs](https://www.datavault4dbt.com/documentation/macro-instructions/reference-data/reference-hub/), [- Satellites](https://www.datavault4dbt.com/documentation/macro-instructions/reference-data/reference-satellite/reference-satellite-v0/), and [- Tables](https://www.datavault4dbt.com/documentation/macro-instructions/reference-data/reference-tables/) +- [PIT Tables](https://www.datavault4dbt.com/documentation/macro-instructions/business-vault/pit/) + - [Hook for Cleaning up PITs](https://www.datavault4dbt.com/documentation/macro-instructions/business-vault/pit/hook-cleanup-pits/) +- [Snapshot Control](https://www.datavault4dbt.com/documentation/macro-instructions/business-vault/snapshot-control/snapshot-control-v0/) ### 
Features With datavault4dbt you will get a lot of awesome features, including: @@ -36,6 +37,7 @@ To use the macros efficiently, there are a few prerequisites you need to provide ### Resources: +- Find technical information about the macros, examples, and more, on [the official datavault4dbt Website](https://www.datavault4dbt.com/)! - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support From 7c32fcb0125a2e8d226f8f55fbf6969c44ae2c07 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:26:52 +0100 Subject: [PATCH 20/22] Update README.md, add Snap Control view link, fix wiki link --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 32410700..c8f71182 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,12 @@ - [Reference Hubs](https://www.datavault4dbt.com/documentation/macro-instructions/reference-data/reference-hub/), [- Satellites](https://www.datavault4dbt.com/documentation/macro-instructions/reference-data/reference-satellite/reference-satellite-v0/), and [- Tables](https://www.datavault4dbt.com/documentation/macro-instructions/reference-data/reference-tables/) - [PIT Tables](https://www.datavault4dbt.com/documentation/macro-instructions/business-vault/pit/) - [Hook for Cleaning up PITs](https://www.datavault4dbt.com/documentation/macro-instructions/business-vault/pit/hook-cleanup-pits/) -- [Snapshot Control](https://www.datavault4dbt.com/documentation/macro-instructions/business-vault/snapshot-control/snapshot-control-v0/) +- Snapshot Control [Tables](https://www.datavault4dbt.com/documentation/macro-instructions/business-vault/snapshot-control/snapshot-control-v0/) and 
[Views](https://www.datavault4dbt.com/documentation/macro-instructions/business-vault/snapshot-control/snapshot-control-v1/) ### Features With datavault4dbt you will get a lot of awesome features, including: - A Data Vault 2.0 implementation congruent to the original Data Vault 2.0 definition by Dan Linstedt -- Ready for both Persistent Staging Areas and Transient Staging Areas, due to the allowance of multiple deltas in all macros, without losing any intermediate changes - Enforcing standards in naming conventions by implementing [global variables](https://github.com/ScalefreeCOM/datavault4dbt/wiki/Global-variables) for technical columns +- Ready for both Persistent Staging Areas and Transient Staging Areas, due to the allowance of multiple deltas in all macros, without losing any intermediate changes - Enforcing standards in naming conventions by implementing [global variables](https://www.datavault4dbt.com/documentation/general-usage-notes/global-variables/) for technical columns - A fully auditable solution for a Data Warehouse - Creating a centralized, snapshot-based Business interface by using a centralized snapshot table supporting logarithmic logic - A modern insert-only approach that avoids updating data @@ -81,11 +81,11 @@ For further information on how to install packages in dbt, please visit the foll [https://docs.getdbt.com/docs/building-a-dbt-project/package-management](https://docs.getdbt.com/docs/building-a-dbt-project/package-management#how-do-i-add-a-package-to-my-project) ### Global variables -datavault4dbt is highly customizable by using many global variables. Since they are applied on multiple levels, a high rate of standardization across your data vault 2.0 solution is guaranteed. The default values of those variables are set inside the packages `dbt_project.yml` and should be copied to your own `dbt_project.yml`. For an explanation of all global variables see [the wiki](https://github.com/ScalefreeCOM/datavault4dbt/wiki/Global-variables). 
+datavault4dbt is highly customizable by using many global variables. Since they are applied on multiple levels, a high rate of standardization across your data vault 2.0 solution is guaranteed. The default values of those variables are set inside the packages `dbt_project.yml` and should be copied to your own `dbt_project.yml`. For an explanation of all global variables see [the docs](https://www.datavault4dbt.com/documentation/general-usage-notes/global-variables/). --- ## Usage -The datavault4dbt package provides macros for Staging and Creation of all DataVault-Entities you need, to build your own DataVault2.0 solution. The usage of the macros is well-explained in the documentation: https://github.com/ScalefreeCOM/datavault4dbt/wiki +The datavault4dbt package provides macros for Staging and Creation of all DataVault-Entities you need, to build your own DataVault2.0 solution. The usage of the macros is well-explained in the [documentation](https://www.datavault4dbt.com/documentation/). --- ## Contributing From b15e3e560bfe46f77e9508755467ff0ede959caf Mon Sep 17 00:00:00 2001 From: Tim Kirschke <81677440+tkirschke@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:30:23 +0100 Subject: [PATCH 21/22] Update README.md --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index c8f71182..6c4c034a 100644 --- a/README.md +++ b/README.md @@ -39,10 +39,6 @@ To use the macros efficiently, there are a few prerequisites you need to provide ### Resources: - Find technical information about the macros, examples, and more, on [the official datavault4dbt Website](https://www.datavault4dbt.com/)! 
- Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) -- Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers -- Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support -- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices -- Find [dbt events](https://events.getdbt.com) near you - Check out the [Scalefree-Blog](https://www.scalefree.com/blog/) - [Data-Vault 2.0 with dbt #1](https://www.scalefree.com/scalefree-newsletter/data-vault-2-0-with-dbt-part-1/) - [Data-Vault 2.0 with dbt #2](https://www.scalefree.com/scalefree-newsletter/data-vault-2-0-with-dbt-part-2/) From dbf902e5b477d5f26e01995697c678190a16cc1e Mon Sep 17 00:00:00 2001 From: Tim Kirschke <81677440+tkirschke@users.noreply.github.com> Date: Mon, 18 Nov 2024 08:23:48 +0100 Subject: [PATCH 22/22] Fixed PARTION BY Clause in Sat v0 for Fabric --- macros/tables/fabric/sat_v0.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/macros/tables/fabric/sat_v0.sql b/macros/tables/fabric/sat_v0.sql index 0e07a5aa..7223949c 100644 --- a/macros/tables/fabric/sat_v0.sql +++ b/macros/tables/fabric/sat_v0.sql @@ -63,7 +63,7 @@ latest_entries_in_sat_prep AS ( SELECT tgt.{{ parent_hashkey }}, tgt.{{ ns.hdiff_alias }}, - ROW_NUMBER() OVER(PARTITION BY tgt.{{ parent_hashkey|lower }} ORDER BY tgt.{{ src_ldts }} DESC) as rn + ROW_NUMBER() OVER(PARTITION BY tgt.{{ parent_hashkey }} ORDER BY tgt.{{ src_ldts }} DESC) as rn FROM {{ this }} tgt INNER JOIN distinct_incoming_hashkeys src ON tgt.{{ parent_hashkey }} = src.{{ parent_hashkey }} @@ -91,7 +91,7 @@ deduplicated_numbered_source_prep AS ( {{ parent_hashkey }}, {{ ns.hdiff_alias }}, {{ datavault4dbt.print_list(source_cols) }} - , LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {{ parent_hashkey|lower }} ORDER BY {{ src_ldts }}) as prev_hashdiff + , LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {{ 
parent_hashkey }} ORDER BY {{ src_ldts }}) as prev_hashdiff FROM source_data ), @@ -142,4 +142,4 @@ records_to_insert AS ( SELECT * FROM records_to_insert -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%}