1
1
import json
2
2
import os
3
- import plotly .express as px
4
- import plotly .graph_objects as go
3
+
5
4
import pandas as pd
6
- from dash import Dash , dcc , html , dash_table
5
+ import plotly .express as px
6
+ from dash import Dash , dash_table , dcc , html
7
7
8
8
# Modify these paths as needed or dynamically discover them.
# PROJECT_ROOT is the directory the "data/<version>/..." metadata paths are
# joined onto.
project_root = os.getenv("PROJECT_ROOT")
if not project_root:
    # Fail fast with an actionable message: an unset (or empty) variable would
    # otherwise surface as a bare ValueError with no hint about what to fix.
    raise ValueError(
        "PROJECT_ROOT environment variable is not set; "
        "point it at the directory that contains the data/ folder."
    )
12
12
13
# Dataset versions, in the order they are loaded. The order is written out by
# hand rather than sorted: as plain strings, "v10" sorts before "v2".
_VERSIONS = (
    "v0",
    "v1",
    "v2",
    "v3",
    "v4",
    "v5",
    "v5_1",
    "v5_2",
    "v6",
    "v7",
    "v8",
    "v9",
    "v10",
    "v11",
    "v12",
    "v13",
)

# One metadata file per version:
#   <project_root>/data/<version>/<version>_metadata.json
METADATA_FILES = [
    os.path.join(project_root, f"data/{name}/{name}_metadata.json")
    for name in _VERSIONS
]
31
+
30
32
31
33
def load_metadata (file_paths ):
32
34
records = []
33
35
for p in file_paths :
34
- with open (p , "r" ) as f :
36
+ with open (p ) as f :
35
37
meta = json .load (f )
36
38
version = os .path .basename (p ).split ("_metadata.json" )[0 ] # e.g. "v10"
37
39
record = {
@@ -47,7 +49,8 @@ def load_metadata(file_paths):
47
49
records .append (record )
48
50
return pd .DataFrame (records )
49
51
50
# Per-version summary table built from METADATA_FILES.
# Not sorted by "version" here: as strings, "v10" sorts before "v2", so the
# rows are left in whatever order load_metadata produces them.
df_main = load_metadata(file_paths=METADATA_FILES)
51
54
52
55
# Create figures
53
56
fig_file_size = px .bar (
@@ -90,31 +93,32 @@ def load_metadata(file_paths):
90
93
)
91
94
92
95
# Build detail about columns from each version.
# Every entry of a metadata file's "columns" mapping becomes one record, so
# df_details holds one row per (version, column) pair, in METADATA_FILES order.
_DETAIL_COLUMNS = [
    "version",
    "column_name",
    "data_type",
    "num_missing",
    "unique_values",
    "memory_usage_bytes",
]

details_records = []
for p in METADATA_FILES:
    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default encoding.
    with open(p, encoding="utf-8") as f:
        meta = json.load(f)
    # The version label is the file name without its "_metadata.json" suffix,
    # e.g. "v10_metadata.json" -> "v10".
    version = os.path.basename(p).split("_metadata.json")[0]
    # Treat both a missing and a null "columns" entry as "no columns".
    columns_meta = meta.get("columns") or {}
    for col_name, col_info in columns_meta.items():
        details_records.append(
            {
                "version": version,
                "column_name": col_name,
                "data_type": col_info.get("data_type"),
                "num_missing": col_info.get("num_missing"),
                "unique_values": col_info.get("unique_values"),
                "memory_usage_bytes": col_info.get("memory_usage_bytes"),
            }
        )

# Naming the columns explicitly keeps the frame's schema stable even when no
# records were collected, so later code can still refer to them by name.
df_details = pd.DataFrame(details_records, columns=_DETAIL_COLUMNS)
111
115
112
116
# Box plot of the per-column "num_missing" values, one box per version, with
# every individual point drawn alongside the box.
fig_missing = px.box(
    data_frame=df_details,
    x="version",
    y="num_missing",
    points="all",
    title="Distribution of 'num_missing' per Column by Version",
).update_layout(xaxis_title="Version", yaxis_title="num_missing")
120
124
0 commit comments