Skip to content

Commit 8690c44

Browse files
Make use of AUTOMATION env var consistent across demo2 notebooks (#167)
* These changes make use of AUTOMATION env var consistent across notebooks.
* Restore previous contents so we can fix CL1->CL2 references and Demo3 refactor as separate PRs.
* Run notebooks preserving output cells (using samples_1 instead of samples_145 S3 pipeline_run folder).
1 parent 00e8734 commit 8690c44

File tree

6 files changed

+1413
-633
lines changed

6 files changed

+1413
-633
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ The following demos provide examples of how to use the tools available with [Ope
99
* [Ingest raw data from S3 as tables on Trino](notebooks/demo1/demo1-create-tables.ipynb)
1010
* [Run SQL queries from a Jupyter Notebook environment](notebooks/demo1/demo1-join-tables.ipynb)
1111
* [Demo 1 Elyra Pipeline](https://github.com/os-climate/aicoe-osc-demo/blob/master/notebooks/demo1/demo1.pipeline)
12-
* [Results visualized on a Superset Dashboard](https://superset-secure-odh-superset.apps.odh-cl1.apps.os-climate.org/superset/dashboard/3/)
12+
* [Results visualized on a Superset Dashboard](https://superset-secure-odh-superset.apps.odh-cl1.apps.os-climate.org/superset/dashboard/3)
1313
* [Video on creating Elyra Pipelines and Superset Dashboard](https://youtu.be/TFgsR7UlcHA)
1414

1515

notebooks/demo2/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
BASE_INFER_KPI_FOLDER = DATA_FOLDER / "infer_KPI"
2222

2323
CHECKPOINT_S3_PREFIX = "aicoe-osc-demo/saved_models"
24-
DATA_S3_PREFIX = "aicoe-osc-demo/pipeline_run/samples_145"
24+
DATA_S3_PREFIX = "aicoe-osc-demo/pipeline_run/samples_1"
2525
BASE_PDF_S3_PREFIX = f"{DATA_S3_PREFIX}/pdfs"
2626
BASE_ANNOTATION_S3_PREFIX = f"{DATA_S3_PREFIX}/annotations"
2727
BASE_EXTRACTION_S3_PREFIX = f"{DATA_S3_PREFIX}/extraction"

notebooks/demo2/create_results_table.ipynb

Lines changed: 75 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@
6565
"outputs": [],
6666
"source": [
6767
"# Load credentials\n",
68-
"dotenv_dir = \"/opt/app-root/src/aicoe-osc-demo\"\n",
68+
"dotenv_dir = os.environ.get(\n",
69+
" \"CREDENTIAL_DOTENV_DIR\", os.environ.get(\"PWD\", \"/opt/app-root/src\")\n",
70+
")\n",
6971
"dotenv_path = pathlib.Path(dotenv_dir) / \"credentials.env\"\n",
7072
"if os.path.exists(dotenv_path):\n",
7173
" load_dotenv(dotenv_path=dotenv_path, override=True)"
@@ -106,7 +108,13 @@
106108
"source": [
107109
"if os.getenv(\"AUTOMATION\"):\n",
108110
" if not os.path.exists(config.BASE_INFER_KPI_FOLDER):\n",
109-
" pathlib.Path(config.BASE_INFER_KPI_FOLDER).mkdir(parents=True, exist_ok=True)"
111+
" pathlib.Path(config.BASE_INFER_KPI_FOLDER).mkdir(parents=True, exist_ok=True)\n",
112+
"\n",
113+
" # Download a sample dataset file from s3\n",
114+
" s3c.download_files_in_prefix_to_dir(\n",
115+
" s3_prefix=config.BASE_INFER_KPI_S3_PREFIX,\n",
116+
" destination_dir=config.BASE_INFER_KPI_FOLDER\n",
117+
" )"
110118
]
111119
},
112120
{
@@ -151,66 +159,66 @@
151159
" <tbody>\n",
152160
" <tr>\n",
153161
" <th>0</th>\n",
154-
" <td>413749035_Eversource Energy_2019-12-31</td>\n",
162+
" <td>sustainability-report-2019</td>\n",
155163
" <td>In which year was the annual report or the sus...</td>\n",
156164
" <td>&lt;NA&gt;</td>\n",
157165
" <td>2019</td>\n",
158-
" <td>7</td>\n",
159-
" <td>• Our core utility operations performed very w...</td>\n",
166+
" <td>3</td>\n",
167+
" <td>This report focuses on the sustainability topi...</td>\n",
160168
" <td>Text</td>\n",
161-
" <td>13.372849</td>\n",
162-
" <td>-10.76948</td>\n",
163-
" <td>-25.76948</td>\n",
169+
" <td>12.819071</td>\n",
170+
" <td>-11.384018</td>\n",
171+
" <td>-26.384018</td>\n",
164172
" </tr>\n",
165173
" <tr>\n",
166174
" <th>1</th>\n",
167-
" <td>413749035_Eversource Energy_2019-12-31</td>\n",
175+
" <td>sustainability-report-2019</td>\n",
168176
" <td>In which year was the annual report or the sus...</td>\n",
169177
" <td>&lt;NA&gt;</td>\n",
170-
" <td>2019</td>\n",
171-
" <td>34</td>\n",
172-
" <td>The American Council for an Energy-Efficient E...</td>\n",
178+
" <td>2018</td>\n",
179+
" <td>7</td>\n",
180+
" <td>According to IPCC’s 1.5 C report from 2018 and...</td>\n",
173181
" <td>Text</td>\n",
174-
" <td>12.66205</td>\n",
175-
" <td>-9.417558</td>\n",
176-
" <td>-24.417558</td>\n",
182+
" <td>12.50875</td>\n",
183+
" <td>-6.967497</td>\n",
184+
" <td>-21.967497</td>\n",
177185
" </tr>\n",
178186
" <tr>\n",
179187
" <th>2</th>\n",
180-
" <td>413749035_Eversource Energy_2019-12-31</td>\n",
188+
" <td>sustainability-report-2019</td>\n",
181189
" <td>In which year was the annual report or the sus...</td>\n",
182190
" <td>&lt;NA&gt;</td>\n",
183191
" <td>2019</td>\n",
184-
" <td>12</td>\n",
185-
" <td>The Eversource Internal Audit Department perfo...</td>\n",
192+
" <td>26</td>\n",
193+
" <td>Equinor Sustainability report 2019 High value ...</td>\n",
186194
" <td>Text</td>\n",
187-
" <td>12.373636</td>\n",
188-
" <td>-10.899869</td>\n",
189-
" <td>-25.899869</td>\n",
195+
" <td>12.427496</td>\n",
196+
" <td>-9.680325</td>\n",
197+
" <td>-24.680325</td>\n",
190198
" </tr>\n",
191199
" <tr>\n",
192200
" <th>3</th>\n",
193-
" <td>413749035_Eversource Energy_2019-12-31</td>\n",
201+
" <td>sustainability-report-2019</td>\n",
194202
" <td>In which year was the annual report or the sus...</td>\n",
195203
" <td>&lt;NA&gt;</td>\n",
196204
" <td>2019</td>\n",
197-
" <td>118</td>\n",
198-
" <td>These are referenced throughout our 2019 Susta...</td>\n",
205+
" <td>8</td>\n",
206+
" <td>Equinor Sustainability report 2019Low carbon —...</td>\n",
199207
" <td>Text</td>\n",
200-
" <td>12.245757</td>\n",
201-
" <td>-10.556628</td>\n",
202-
" <td>-25.556628</td>\n",
208+
" <td>12.356202</td>\n",
209+
" <td>-8.748007</td>\n",
210+
" <td>-23.748007</td>\n",
203211
" </tr>\n",
204212
" <tr>\n",
205213
" <th>4</th>\n",
206-
" <td>413749035_Eversource Energy_2019-12-31</td>\n",
214+
" <td>sustainability-report-2019</td>\n",
207215
" <td>What is the annual total production from coal?</td>\n",
208216
" <td>&lt;NA&gt;</td>\n",
209217
" <td>no_answer</td>\n",
210218
" <td>&lt;NA&gt;</td>\n",
211219
" <td>&lt;NA&gt;</td>\n",
212220
" <td>Text</td>\n",
213-
" <td>2.720188</td>\n",
221+
" <td>2.840454</td>\n",
214222
" <td>&lt;NA&gt;</td>\n",
215223
" <td>&lt;NA&gt;</td>\n",
216224
" </tr>\n",
@@ -219,32 +227,32 @@
219227
"</div>"
220228
],
221229
"text/plain": [
222-
" pdf_name \\\n",
223-
"0 413749035_Eversource Energy_2019-12-31 \n",
224-
"1 413749035_Eversource Energy_2019-12-31 \n",
225-
"2 413749035_Eversource Energy_2019-12-31 \n",
226-
"3 413749035_Eversource Energy_2019-12-31 \n",
227-
"4 413749035_Eversource Energy_2019-12-31 \n",
230+
" pdf_name \\\n",
231+
"0 sustainability-report-2019 \n",
232+
"1 sustainability-report-2019 \n",
233+
"2 sustainability-report-2019 \n",
234+
"3 sustainability-report-2019 \n",
235+
"4 sustainability-report-2019 \n",
228236
"\n",
229237
" kpi kpi_id answer page \\\n",
230-
"0 In which year was the annual report or the sus... <NA> 2019 7 \n",
231-
"1 In which year was the annual report or the sus... <NA> 2019 34 \n",
232-
"2 In which year was the annual report or the sus... <NA> 2019 12 \n",
233-
"3 In which year was the annual report or the sus... <NA> 2019 118 \n",
238+
"0 In which year was the annual report or the sus... <NA> 2019 3 \n",
239+
"1 In which year was the annual report or the sus... <NA> 2018 7 \n",
240+
"2 In which year was the annual report or the sus... <NA> 2019 26 \n",
241+
"3 In which year was the annual report or the sus... <NA> 2019 8 \n",
234242
"4 What is the annual total production from coal? <NA> no_answer <NA> \n",
235243
"\n",
236244
" paragraph source score \\\n",
237-
"0 • Our core utility operations performed very w... Text 13.372849 \n",
238-
"1 The American Council for an Energy-Efficient E... Text 12.66205 \n",
239-
"2 The Eversource Internal Audit Department perfo... Text 12.373636 \n",
240-
"3 These are referenced throughout our 2019 Susta... Text 12.245757 \n",
241-
"4 <NA> Text 2.720188 \n",
245+
"0 This report focuses on the sustainability topi... Text 12.819071 \n",
246+
"1 According to IPCC’s 1.5 C report from 2018 and... Text 12.50875 \n",
247+
"2 Equinor Sustainability report 2019 High value ... Text 12.427496 \n",
248+
"3 Equinor Sustainability report 2019Low carbon —... Text 12.356202 \n",
249+
"4 <NA> Text 2.840454 \n",
242250
"\n",
243251
" no_ans_score no_answer_score_plus_boost \n",
244-
"0 -10.76948 -25.76948 \n",
245-
"1 -9.417558 -24.417558 \n",
246-
"2 -10.899869 -25.899869 \n",
247-
"3 -10.556628 -25.556628 \n",
252+
"0 -11.384018 -26.384018 \n",
253+
"1 -6.967497 -21.967497 \n",
254+
"2 -9.680325 -24.680325 \n",
255+
"3 -8.748007 -23.748007 \n",
248256
"4 <NA> <NA> "
249257
]
250258
},
@@ -254,12 +262,6 @@
254262
}
255263
],
256264
"source": [
257-
"# Download a sample dataset file from s3\n",
258-
"s3c.download_files_in_prefix_to_dir(\n",
259-
" s3_prefix=config.BASE_INFER_KPI_S3_PREFIX,\n",
260-
" destination_dir=config.BASE_INFER_KPI_FOLDER\n",
261-
")\n",
262-
"\n",
263265
"all_files = glob.glob(str(config.BASE_INFER_KPI_FOLDER / \"*.csv\"))\n",
264266
"list_of_files = []\n",
265267
"\n",
@@ -315,22 +317,22 @@
315317
"output_type": "stream",
316318
"text": [
317319
"<class 'pandas.core.frame.DataFrame'>\n",
318-
"RangeIndex: 689 entries, 0 to 688\n",
320+
"RangeIndex: 96 entries, 0 to 95\n",
319321
"Data columns (total 10 columns):\n",
320322
" # Column Non-Null Count Dtype \n",
321323
"--- ------ -------------- ----- \n",
322-
" 0 pdf_name 689 non-null string \n",
323-
" 1 kpi 689 non-null string \n",
324+
" 0 pdf_name 96 non-null string \n",
325+
" 1 kpi 96 non-null string \n",
324326
" 2 kpi_id 0 non-null Int64 \n",
325-
" 3 answer 689 non-null string \n",
326-
" 4 page 555 non-null Int64 \n",
327-
" 5 paragraph 555 non-null string \n",
328-
" 6 source 689 non-null string \n",
329-
" 7 score 689 non-null Float64\n",
330-
" 8 no_ans_score 555 non-null Float64\n",
331-
" 9 no_answer_score_plus_boost 555 non-null Float64\n",
327+
" 3 answer 96 non-null string \n",
328+
" 4 page 79 non-null Int64 \n",
329+
" 5 paragraph 79 non-null string \n",
330+
" 6 source 96 non-null string \n",
331+
" 7 score 96 non-null Float64\n",
332+
" 8 no_ans_score 79 non-null Float64\n",
333+
" 9 no_answer_score_plus_boost 79 non-null Float64\n",
332334
"dtypes: Float64(3), Int64(2), string(5)\n",
333-
"memory usage: 57.3 KB\n"
335+
"memory usage: 8.1 KB\n"
334336
]
335337
}
336338
],
@@ -359,15 +361,6 @@
359361
"name": "stdout",
360362
"output_type": "stream",
361363
"text": [
362-
"200\n",
363-
"200\n",
364-
"200\n",
365-
"200\n",
366-
"200\n",
367-
"200\n",
368-
"200\n",
369-
"200\n",
370-
"200\n",
371364
"200\n"
372365
]
373366
}
@@ -464,16 +457,16 @@
464457
{
465458
"data": {
466459
"text/plain": [
467-
"['sustainability-report-2019',\n",
460+
"['90044053_Fisher & Paykel Hl_2017-11-07',\n",
468461
" 'In which year was the annual report or the sustainability report published?',\n",
469462
" None,\n",
470-
" '2019',\n",
471-
" 26,\n",
472-
" 'Equinor Sustainability report 2019 High value — creating shared value',\n",
463+
" '2017',\n",
464+
" 2,\n",
465+
" 'Corporate Responsibility and Sustainability Report 2017Fisher & Paykel Healthcare Corporation Limited',\n",
473466
" 'Text',\n",
474-
" 12.427505493164062,\n",
475-
" -9.680328369140623,\n",
476-
" -24.680328369140625]"
467+
" 11.549626350402832,\n",
468+
" -8.787019729614258,\n",
469+
" -23.787019729614254]"
477470
]
478471
},
479472
"execution_count": 11,
@@ -514,7 +507,7 @@
514507
"name": "python",
515508
"nbconvert_exporter": "python",
516509
"pygments_lexer": "ipython3",
517-
"version": "3.8.6"
510+
"version": "3.8.8"
518511
}
519512
},
520513
"nbformat": 4,

0 commit comments

Comments
 (0)