Skip to content

Commit b2d78e6

Browse files
authored
Merge pull request #148 from chauhankaranraj/update-tbl-cur
Update table curator to work with changes in KPI mapping utils.
2 parents 183e658 + 39d4183 commit b2d78e6

File tree

1 file changed

+11
-6
lines changed

1 file changed

+11
-6
lines changed

src/components/preprocessing/table_curator.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import pandas as pd
1111
from fuzzywuzzy import fuzz
1212

13-
from src.components.utils.kpi_mapping import KPI_MAPPING, KPI_CATEGORY
13+
from src.components.utils.kpi_mapping import get_kpi_mapping_category
1414
from .base_curator import BaseCurator
1515

1616
logger = logging.getLogger(__name__)
@@ -50,7 +50,7 @@ def __init__(
5050
self.company_to_exclude = company_to_exclude
5151
random.seed(seed)
5252

53-
def run(self, extraction_folder, annotation_excels, output_folder):
53+
def run(self, extraction_folder, annotation_excels, output_folder, kpi_df):
5454
"""Create ESG table dataset.
5555
5656
It saves all examples in a csv.
@@ -69,7 +69,7 @@ def run(self, extraction_folder, annotation_excels, output_folder):
6969

7070
examples_list = []
7171
for excel_file in self.annotation_excels:
72-
examples_excel = self.process_single_annotation_file(excel_file)
72+
examples_excel = self.process_single_annotation_file(excel_file, kpi_df)
7373
examples_list.extend(examples_excel)
7474

7575
df_result = pd.DataFrame(examples_list).reset_index(drop=True)
@@ -211,7 +211,7 @@ def __obtain_filename_to_strarr(self):
211211

212212
return filename_to_stringarr
213213

214-
def __clean_annotation_file(self, df, annotation_filepath):
214+
def __clean_annotation_file(self, df, annotation_filepath, kpi_df):
215215
"""Clean annotation file.
216216
217217
Returns a clean dataframe after dropping all NaN rows,
@@ -260,6 +260,11 @@ def get_pdf_name_right(f):
260260

261261
df["source_file"] = df["source_file"].apply(get_pdf_name_right)
262262

263+
# get kpi mappings
264+
kpi_dict = get_kpi_mapping_category(kpi_df)
265+
KPI_MAPPING = kpi_dict["KPI_MAPPING"]
266+
KPI_CATEGORY = kpi_dict["KPI_CATEGORY"]
267+
263268
# KPI mapping. No need to make this a class method.
264269
def map_kpi(r):
265270
try:
@@ -356,7 +361,7 @@ def __create_table_meta(self):
356361
return meta_dict
357362

358363
def process_single_annotation_file(
359-
self, annotation_filepath, sheet_name="data_ex_in_xls"
364+
self, annotation_filepath, kpi_df, sheet_name="data_ex_in_xls",
360365
):
361366
"""Create examples for a single excel file.
362367
@@ -388,7 +393,7 @@ def process_single_annotation_file(
388393
return [[]]
389394

390395
# clean dataframe
391-
df = self.__clean_annotation_file(df, annotation_filepath)
396+
df = self.__clean_annotation_file(df, annotation_filepath, kpi_df)
392397

393398
# table_meta contains {pdf_name:{page: list of table csvs, ...}, ...}
394399
table_meta = self.__create_table_meta()

0 commit comments

Comments
 (0)