
Commit e919f7b

Fix duplication of file reading for Converter subclass & Add docs and example.
1 parent 666f291 commit e919f7b

File tree

4 files changed: +74 −26 lines changed

README.md

Lines changed: 29 additions & 6 deletions
@@ -1,11 +1,23 @@
 # edf2parquet
 Simple utility package to convert EDF/EDF+ files into Apache Parquet format
-while preserving the EDF file header information and signal headers metadata information.
-Currently, each signal is stored as a separate parquet file, with the option to automatically
-add a pandas readable DatetimeIndex.
+while preserving the EDF file header information and signal header metadata, with some nice enhanced features:
+- handling of non-strictly EDF compliant .EDF headers (e.g. UTF-8 characters in the header)
+- automatic conversion of the EDF header start date and signal sampling frequency to a pd.DatetimeIndex with the correct timezone and frequency for easy Pandas interoperability (at the cost of slightly bigger file sizes, of course)
+- skipping of specific signals during conversion
+- bundling of signals with the same sampling frequency into a single parquet file
+- splitting of EDF files by non-use periods (e.g. if a file spans multiple continuous nights and you want a single file per night)
+- compression of the resulting parquet files
+
 
 ## Installation
 
+### Requirements
+The package was tested with the pinned versions in the `requirements.txt` file.
+If something does not work, try installing these exact versions. I would particularly advise
+using matching or more recent versions of PyArrow and Pandas (version 2.0 is important,
+as Pandas 2.0 itself uses Arrow data structures under the hood, so it will break with anything
+below 2.0, as far as I'm aware).
+
 ```bash
 pip install git+https://github.com/NarayanSchuetz/edf2parquet.git
 ```
@@ -16,7 +28,7 @@ Convert an EDF file into Apache parquet format using the EdfToParquetConverter c
 ```python
 import pytz
 
-from edf2parquet.converters import EdfToParquetConverter
+from edf2parquet.converters import EdfToParquetConverter, AdvancedEdfToParquetConverter
 
 my_edf_file_path = "path_to_my_edfile.edf"
 my_parquet_output_dir = "path_to_my_parquet_output_dir"
@@ -27,6 +39,18 @@ converter = EdfToParquetConverter(edf_file_path=my_edf_file_path,
                                   parquet_output_dir=my_parquet_output_dir,
                                   compression_codec="GZIP")
 
+converter.convert()
+
+# or alternatively using the advanced converter
+converter = AdvancedEdfToParquetConverter(edf_file_path=my_edf_file_path,  # path to the EDF file
+                                          exclude_signals=["Audio"],  # signals to exclude from the conversion
+                                          parquet_output_dir=my_parquet_output_dir,  # output directory (created if it does not exist)
+                                          group_by_sampling_freq=True,  # group signals with the same sampling frequency into a single parquet file
+                                          datetime_index=True,  # automatically add a pd.DatetimeIndex to the resulting parquet files
+                                          local_timezone=(pytz.timezone("Europe/Zurich"), pytz.timezone("Europe/Zurich")),  # timezone of the EDF file and of its start_date (the same in most cases)
+                                          compression_codec="GZIP",  # compression codec for the resulting parquet files
+                                          split_non_use_by_col="MY_COLUMN")  # only specify this to split the file by non-use periods (e.g. one output file per night); names the column used for splitting -> see the AdvancedEdfToParquetConverter docstring
+
 converter.convert()
 ```
 ### Reading:
@@ -62,8 +86,7 @@ reader.get_signal_headers()
 Check the `examples.ipynb` notebook for detailed outputs.
 
 ## Todo
-- [ ] Allow to bundle signals with the same sampling rate into a single parquet file.
+- [x] Allow to bundle signals with the same sampling rate into a single parquet file.
 - [ ] Provide a high level user API
 - [ ] Enable (possibly distributed) parallel processing to efficiently convert a whole directory of EDF files.
-- [ ] Provide a high level API to convert EDF files with the same sampling frequency (fs) into a single parquet file with a single row per signal.
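
Since the converter writes ordinary parquet files, the output can also be inspected without edf2parquet. Below is a minimal read-back sketch; the file name is a placeholder (actual names depend on signal labels and grouping settings), and the assumption that the preserved EDF headers land in the parquet key-value metadata follows from the README's "preserving the header information" claim:

```python
import pandas as pd
import pyarrow.parquet as pq

# Placeholder path: actual file names depend on signal labels / grouping settings.
parquet_file = "path_to_my_parquet_output_dir/EEG.parquet"

df = pd.read_parquet(parquet_file)
print(df.index[:5])  # with datetime_index=True, a tz-aware DatetimeIndex

# The preserved EDF file/signal headers are presumably reachable via the
# parquet key-value metadata (raw bytes key/value pairs).
print(pq.read_schema(parquet_file).metadata)
```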

edf2parquet/converters.py

Lines changed: 5 additions & 2 deletions
@@ -50,7 +50,7 @@ def __init__(
         """
         self._datetime_index = datetime_index
         self._default_signal_dtype = default_signal_dtype
-        self._edf_file = pyedflib.EdfReader(edf_file_path)
+        self._edf_reader = EdfReader(edf_file_path)
         self._parquet_output_dir = parquet_output_dir
         self._compression_codec = compression_codec
         self._local_timezone = local_timezone
@@ -61,6 +61,10 @@ def __del__(self) -> None:
     def __repr__(self) -> str:
         return f"EdfToParquetConverter({self._edf_file.getHeader()})"
 
+    @property
+    def _edf_file(self):
+        return self._edf_reader.edf_file
+
     def convert(self) -> Optional[Dict[str, pa.Table]]:
         """
         Converts an EDF/EDF+ file to Apache Parquet file format.
@@ -241,7 +245,6 @@ def __init__(
         """
         super().__init__(edf_file_path, datetime_index, default_signal_dtype, parquet_output_dir,
                          compression_codec, local_timezone)
-        self._edf_reader = EdfReader(edf_file_path)
         self._group_by_sampling_freq = group_by_sampling_freq
         self._exclude_signals = exclude_signals
         self._split_non_use_by_col = split_non_use_by_col
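The removed line above is the actual fix: previously both the base class and the AdvancedEdfToParquetConverter subclass opened their own EdfReader, so the file was read twice. Now the base class owns the single reader and exposes the underlying handle through the new `_edf_file` property. A stripped-down sketch of the pattern, using pyedflib directly instead of edf2parquet's internal EdfReader wrapper and omitting all other constructor arguments:

```python
import pyedflib

class BaseConverter:
    def __init__(self, edf_file_path: str):
        # The file is opened exactly once, in the base class.
        self._edf_reader = pyedflib.EdfReader(edf_file_path)

    @property
    def _edf_file(self) -> pyedflib.EdfReader:
        # Subclasses reach the shared handle through this property
        # instead of opening the file a second time.
        return self._edf_reader

class AdvancedConverter(BaseConverter):
    def __init__(self, edf_file_path: str):
        super().__init__(edf_file_path)  # reuses the reader created above
        # ...advanced-only configuration would go here...
```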

examples.ipynb

Lines changed: 35 additions & 13 deletions
@@ -34,6 +34,41 @@
    "converter.convert()"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "source": [
+   "### Read an EDF file and convert it to Parquet files using the AdvancedEdfToParquetConverter class directly."
+  ],
+  "metadata": {
+   "collapsed": false
+  }
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "outputs": [],
+  "source": [
+   "from edf2parquet.converters import AdvancedEdfToParquetConverter\n",
+   "import pytz\n",
+   "\n",
+   "my_edf_file = \"path_to_my_edfile.edf\"  # REPLACE WITH YOUR EDF FILE PATH\n",
+   "my_parquet_output_dir = \"path_to_my_parquet_output_dir\"  # REPLACE WITH YOUR PARQUET OUTPUT DIRECTORY\n",
+   "\n",
+   "converter = AdvancedEdfToParquetConverter(edf_file_path=my_edf_file,  # path to the EDF file\n",
+   "                                          exclude_signals=[\"Audio\"],  # signals to exclude from the conversion\n",
+   "                                          parquet_output_dir=my_parquet_output_dir,  # output directory (created if it does not exist)\n",
+   "                                          group_by_sampling_freq=True,  # group signals with the same sampling frequency into a single parquet file\n",
+   "                                          datetime_index=True,  # automatically add a pd.DatetimeIndex to the resulting parquet files\n",
+   "                                          local_timezone=(pytz.timezone(\"Europe/Zurich\"), pytz.timezone(\"Europe/Zurich\")),  # timezone of the EDF file and of its start_date (the same in most cases)\n",
+   "                                          compression_codec=\"GZIP\",  # compression codec for the resulting parquet files\n",
+   "                                          split_non_use_by_col=\"MY_COLUMN\")  # only specify this to split the file by non-use periods (e.g. one output file per night); names the column used for splitting -> see the AdvancedEdfToParquetConverter docstring\n",
+   "\n",
+   "converter.convert()"
+  ],
+  "metadata": {
+   "collapsed": false
+  }
+ },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -220,19 +255,6 @@
     "end_time": "2023-04-23T18:37:56.725993Z"
    }
   }
- },
- {
-  "cell_type": "code",
-  "execution_count": 1,
-  "outputs": [],
-  "source": [],
-  "metadata": {
-   "collapsed": false,
-   "ExecuteTime": {
-    "start_time": "2023-04-25T17:04:41.598905Z",
-    "end_time": "2023-04-25T17:04:41.602388Z"
-   }
-  }
  }
 ],
 "metadata": {

setup.py

Lines changed: 5 additions & 5 deletions
@@ -3,7 +3,7 @@
 
 MAJOR = "0"
 MINOR = "1"
-PATCH = "1"
+PATCH = "2"
 
 _VERSION_TAG = "{MAJOR}.{MINOR}.{PATCH}".format(MAJOR=MAJOR, MINOR=MINOR, PATCH=PATCH)
 
@@ -12,8 +12,8 @@
 
 
 def get_version():
-    import subprocess
-    commit_hash = str(subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE).stdout)[2:-3]
+    # import subprocess
+    # commit_hash = str(subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE).stdout)[2:-3]
     return '{VERSION_TAG}'.format(VERSION_TAG=_VERSION_TAG)
 
 
@@ -30,9 +30,9 @@ def get_version():
     install_requires=[
         'pytest',
         'numpy',
-        'pandas',
+        'pandas>=2.0.0',
         'pyarrow',
-        'pyedflib==0.1.19',  # newer versions
+        'pyedflib>=0.1.32',
     ],
     setup_requires=[
         'pytest-runner',
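
The loosened pyedflib pin and the new pandas >= 2.0.0 floor can be checked in an existing environment before converting anything. A quick sanity-check sketch, assuming the third-party `packaging` library is available (it usually is wherever pip is):

```python
from importlib.metadata import version
from packaging.version import Version  # third-party, commonly installed alongside pip

# Mirrors the pins introduced in setup.py above.
assert Version(version("pandas")) >= Version("2.0.0"), "pandas >= 2.0.0 required"
assert Version(version("pyedflib")) >= Version("0.1.32"), "pyedflib >= 0.1.32 required"
print("environment satisfies edf2parquet's requirements")
```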

0 commit comments
