apache
diff --git a/‎paimon-python/pypaimon/common/predicate.py
Lines changed: 101 additions & 0 deletions b/‎paimon-python/pypaimon/common/predicate.py
Lines changed: 101 additions & 0 deletions
diff --git a/‎paimon-python/pypaimon/manifest/manifest_file_manager.py
Lines changed: 16 additions & 16 deletions b/‎paimon-python/pypaimon/manifest/manifest_file_manager.py
Lines changed: 16 additions & 16 deletions
diff --git a/‎paimon-python/pypaimon/manifest/manifest_list_manager.py
Lines changed: 6 additions & 6 deletions b/‎paimon-python/pypaimon/manifest/manifest_list_manager.py
Lines changed: 6 additions & 6 deletions
diff --git a/‎paimon-python/pypaimon/manifest/schema/simple_stats.py
Lines changed: 3 additions & 3 deletions b/‎paimon-python/pypaimon/manifest/schema/simple_stats.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎paimon-python/pypaimon/read/push_down_utils.py
Lines changed: 72 additions & 0 deletions b/‎paimon-python/pypaimon/read/push_down_utils.py
Lines changed: 72 additions & 0 deletions
diff --git a/‎paimon-python/pypaimon/read/reader/format_avro_reader.py
Lines changed: 8 additions & 16 deletions b/‎paimon-python/pypaimon/read/reader/format_avro_reader.py
Lines changed: 8 additions & 16 deletions
@@ -82,6 +82,107 @@ def test(self, record: InternalRow) -> bool:
         else:
             raise ValueError("Unsupported predicate method: {}".format(self.method))
 
+    def test_by_value(self, value: Any) -> bool:
+        """Evaluate this predicate against one already-extracted value.
+
+        Compound predicates (``and`` / ``or``) hand the same ``value`` to every
+        child predicate held in ``self.literals``; leaf predicates compare
+        ``value`` with the entries of ``self.literals``.
+
+        A ``None`` value is treated as SQL NULL: ``isNull`` accepts it and every
+        other leaf method evaluates to ``False``, instead of raising
+        ``TypeError`` on an ordering comparison or answering ``True`` for
+        ``notEqual`` / ``notIn``.
+
+        Args:
+            value: The value to test; ``None`` means NULL.
+
+        Returns:
+            ``True`` when ``value`` satisfies the predicate.
+
+        Raises:
+            ValueError: If ``self.method`` is not a supported predicate method.
+        """
+        method = self.method
+        if method == 'and':
+            return all(p.test_by_value(value) for p in self.literals)
+        if method == 'or':
+            return any(p.test_by_value(value) for p in self.literals)
+
+        if method == 'isNull':
+            return value is None
+        if method == 'isNotNull':
+            return value is not None
+
+        # Every remaining leaf method rejects NULL. Validate the method name
+        # first so an unknown method still raises even when value is None.
+        null_rejecting = (
+            'equal', 'notEqual', 'lessThan', 'lessOrEqual', 'greaterThan', 'greaterOrEqual',
+            'startsWith', 'endsWith', 'contains', 'in', 'notIn', 'between',
+        )
+        if method not in null_rejecting:
+            raise ValueError(f"Unsupported predicate method: {self.method}")
+        if value is None:
+            return False
+
+        literals = self.literals
+        if method == 'equal':
+            return value == literals[0]
+        if method == 'notEqual':
+            return value != literals[0]
+        if method == 'lessThan':
+            return value < literals[0]
+        if method == 'lessOrEqual':
+            return value <= literals[0]
+        if method == 'greaterThan':
+            return value > literals[0]
+        if method == 'greaterOrEqual':
+            return value >= literals[0]
+        # String matching only applies to str values; anything else cannot match.
+        if method == 'startsWith':
+            return isinstance(value, str) and value.startswith(literals[0])
+        if method == 'endsWith':
+            return isinstance(value, str) and value.endswith(literals[0])
+        if method == 'contains':
+            return isinstance(value, str) and literals[0] in value
+        if method == 'in':
+            return value in literals
+        if method == 'notIn':
+            return value not in literals
+        # Only 'between' is left: inclusive on both ends.
+        return literals[0] <= value <= literals[1]
+
+    def test_by_stats(self, stat: dict) -> bool:
+        """Decide from column statistics whether matching rows may exist.
+
+        This is a pruning check: ``False`` means the statistics prove that no
+        row can satisfy the predicate, ``True`` means a match is possible.
+        Whenever a statistic is unknown (``None``) the answer is ``True``,
+        because pruning on missing information would drop valid rows.
+
+        Args:
+            stat: Mapping with the keys ``"row_count"``, ``"null_counts"``,
+                ``"min_values"`` and ``"max_values"``; the last three are
+                indexed by ``self.field``.
+
+        Returns:
+            ``False`` only when the statistics rule out every row.
+
+        Raises:
+            RuntimeError: For ``startsWith`` when the min/max values are not ``str``.
+            ValueError: If ``self.method`` is not a supported predicate method.
+        """
+        method = self.method
+        if method == 'and':
+            return all(p.test_by_stats(stat) for p in self.literals)
+        if method == 'or':
+            return any(p.test_by_stats(stat) for p in self.literals)
+
+        null_count = stat["null_counts"][self.field]
+        row_count = stat["row_count"]
+
+        if method == 'isNull':
+            # An unknown null count cannot prove the absence of NULLs.
+            return null_count is None or null_count > 0
+        if method == 'isNotNull':
+            return null_count is None or row_count is None or null_count < row_count
+
+        min_value = stat["min_values"][self.field]
+        max_value = stat["max_values"][self.field]
+
+        # Every value is NULL: no value-based predicate can match.
+        if null_count is not None and null_count == row_count:
+            return False
+        # Bounds are unknown although not everything is known to be NULL:
+        # nothing can be ruled out, so keep the data.
+        if min_value is None or max_value is None:
+            return True
+
+        literals = self.literals
+        if method == 'equal':
+            return min_value <= literals[0] <= max_value
+        if method == 'notEqual':
+            # Prunable only when every value equals the literal.
+            return not (min_value == literals[0] == max_value)
+        if method == 'lessThan':
+            return literals[0] > min_value
+        if method == 'lessOrEqual':
+            return literals[0] >= min_value
+        if method == 'greaterThan':
+            return literals[0] < max_value
+        if method == 'greaterOrEqual':
+            return literals[0] <= max_value
+        if method == 'startsWith':
+            if not isinstance(min_value, str) or not isinstance(max_value, str):
+                raise RuntimeError("startsWith predicate on non-str field")
+            prefix = literals[0]
+            return ((min_value.startswith(prefix) or min_value < prefix)
+                    and (max_value.startswith(prefix) or max_value > prefix))
+        if method in ('endsWith', 'contains'):
+            # Min/max bounds carry no information about suffixes or substrings.
+            return True
+        if method == 'in':
+            return any(min_value <= literal <= max_value for literal in literals)
+        if method == 'notIn':
+            # Prunable only when every value equals one of the excluded literals.
+            return not any(min_value == literal == max_value for literal in literals)
+        if method == 'between':
+            return literals[0] <= max_value and literals[1] >= min_value
+
+        raise ValueError(f"Unsupported predicate method: {self.method}")
+
     def to_arrow(self) -> pyarrow_compute.Expression | bool:
         if self.method == 'equal':
             return pyarrow_dataset.field(self.field) == self.literals[0]
 
@@ -55,19 +55,19 @@ def read(self, manifest_file_name: str, shard_filter=None) -> List[ManifestEntry
             file_dict = dict(record['_FILE'])
             key_dict = dict(file_dict['_KEY_STATS'])
             key_stats = SimpleStats(
-                min_value=BinaryRowDeserializer.from_bytes(key_dict['_MIN_VALUES'],
-                                                           self.trimmed_primary_key_fields),
-                max_value=BinaryRowDeserializer.from_bytes(key_dict['_MAX_VALUES'],
-                                                           self.trimmed_primary_key_fields),
-                null_count=key_dict['_NULL_COUNTS'],
+                min_values=BinaryRowDeserializer.from_bytes(key_dict['_MIN_VALUES'],
+                                                            self.trimmed_primary_key_fields),
+                max_values=BinaryRowDeserializer.from_bytes(key_dict['_MAX_VALUES'],
+                                                            self.trimmed_primary_key_fields),
+                null_counts=key_dict['_NULL_COUNTS'],
             )
             value_dict = dict(file_dict['_VALUE_STATS'])
             value_stats = SimpleStats(
-                min_value=BinaryRowDeserializer.from_bytes(value_dict['_MIN_VALUES'],
-                                                           self.table.table_schema.fields),
-                max_value=BinaryRowDeserializer.from_bytes(value_dict['_MAX_VALUES'],
-                                                           self.table.table_schema.fields),
-                null_count=value_dict['_NULL_COUNTS'],
+                min_values=BinaryRowDeserializer.from_bytes(value_dict['_MIN_VALUES'],
+                                                            self.table.table_schema.fields),
+                max_values=BinaryRowDeserializer.from_bytes(value_dict['_MAX_VALUES'],
+                                                            self.table.table_schema.fields),
+                null_counts=value_dict['_NULL_COUNTS'],
             )
             file_meta = DataFileMeta(
                 file_name=file_dict['_FILE_NAME'],
@@ -118,14 +118,14 @@ def write(self, file_name, commit_messages: List[CommitMessage]):
                         "_MIN_KEY": BinaryRowSerializer.to_bytes(file.min_key),
                         "_MAX_KEY": BinaryRowSerializer.to_bytes(file.max_key),
                         "_KEY_STATS": {
-                            "_MIN_VALUES": BinaryRowSerializer.to_bytes(file.key_stats.min_value),
-                            "_MAX_VALUES": BinaryRowSerializer.to_bytes(file.key_stats.max_value),
-                            "_NULL_COUNTS": file.key_stats.null_count,
+                            "_MIN_VALUES": BinaryRowSerializer.to_bytes(file.key_stats.min_values),
+                            "_MAX_VALUES": BinaryRowSerializer.to_bytes(file.key_stats.max_values),
+                            "_NULL_COUNTS": file.key_stats.null_counts,
                         },
                         "_VALUE_STATS": {
-                            "_MIN_VALUES": BinaryRowSerializer.to_bytes(file.value_stats.min_value),
-                            "_MAX_VALUES": BinaryRowSerializer.to_bytes(file.value_stats.max_value),
-                            "_NULL_COUNTS": file.value_stats.null_count,
+                            "_MIN_VALUES": BinaryRowSerializer.to_bytes(file.value_stats.min_values),
+                            "_MAX_VALUES": BinaryRowSerializer.to_bytes(file.value_stats.max_values),
+                            "_NULL_COUNTS": file.value_stats.null_counts,
                         },
                         "_MIN_SEQUENCE_NUMBER": file.min_sequence_number,
                         "_MAX_SEQUENCE_NUMBER": file.max_sequence_number,
 
@@ -58,15 +58,15 @@ def read(self, manifest_list_name: str) -> List[ManifestFileMeta]:
         for record in reader:
             stats_dict = dict(record['_PARTITION_STATS'])
             partition_stats = SimpleStats(
-                min_value=BinaryRowDeserializer.from_bytes(
+                min_values=BinaryRowDeserializer.from_bytes(
                     stats_dict['_MIN_VALUES'],
                     self.table.table_schema.get_partition_key_fields()
                 ),
-                max_value=BinaryRowDeserializer.from_bytes(
+                max_values=BinaryRowDeserializer.from_bytes(
                     stats_dict['_MAX_VALUES'],
                     self.table.table_schema.get_partition_key_fields()
                 ),
-                null_count=stats_dict['_NULL_COUNTS'],
+                null_counts=stats_dict['_NULL_COUNTS'],
             )
             manifest_file_meta = ManifestFileMeta(
                 file_name=record['_FILE_NAME'],
@@ -90,9 +90,9 @@ def write(self, file_name, manifest_file_metas: List[ManifestFileMeta]):
                 "_NUM_ADDED_FILES": meta.num_added_files,
                 "_NUM_DELETED_FILES": meta.num_deleted_files,
                 "_PARTITION_STATS": {
-                    "_MIN_VALUES": BinaryRowSerializer.to_bytes(meta.partition_stats.min_value),
-                    "_MAX_VALUES": BinaryRowSerializer.to_bytes(meta.partition_stats.max_value),
-                    "_NULL_COUNTS": meta.partition_stats.null_count,
+                    "_MIN_VALUES": BinaryRowSerializer.to_bytes(meta.partition_stats.min_values),
+                    "_MAX_VALUES": BinaryRowSerializer.to_bytes(meta.partition_stats.max_values),
+                    "_NULL_COUNTS": meta.partition_stats.null_counts,
                 },
                 "_SCHEMA_ID": meta.schema_id,
             }
 
@@ -24,9 +24,9 @@
 
 @dataclass
 class SimpleStats:
+    # Min values, max values and null counts, kept together as one stats record.
+    # The plural attribute names line up with the `_MIN_VALUES` / `_MAX_VALUES` /
+    # `_NULL_COUNTS` manifest keys these attributes are read from and written to.
-    min_value: BinaryRow
-    max_value: BinaryRow
-    null_count: Optional[List[int]]
+    min_values: BinaryRow
+    max_values: BinaryRow
+    null_counts: Optional[List[int]]
 
 
 SIMPLE_STATS_SCHEMA = {
 
@@ -0,0 +1,72 @@
+################################################################################
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from typing import List, Set
+
+from pypaimon.common.predicate import Predicate
+
+
+def extract_predicate_to_list(result: list, input_predicate: 'Predicate', keys: List[str]):
+    """Append to ``result`` the parts of ``input_predicate`` that only touch ``keys``.
+
+    ``and`` predicates are flattened recursively, so each conjunct is judged on
+    its own. An ``or`` predicate is kept as a whole, and only when every field
+    it involves is one of ``keys``. A leaf predicate is kept when its field is
+    one of ``keys``. Nothing is collected when the predicate or ``keys`` is empty.
+    """
+    if not (input_predicate and keys):
+        return
+
+    kind = input_predicate.method
+    if kind == 'and':
+        for child in input_predicate.literals:
+            extract_predicate_to_list(result, child, keys)
+    elif kind == 'or':
+        # An 'or' is only usable when all of its fields are among the keys.
+        fields = _get_all_fields(input_predicate)
+        if fields and fields.issubset(keys):
+            result.append(input_predicate)
+    elif input_predicate.field in keys:
+        result.append(input_predicate)
+
+
+def _get_all_fields(predicate: 'Predicate') -> Set[str]:
+    """Return the set of field names referenced anywhere inside ``predicate``.
+
+    A predicate with a non-None ``field`` contributes just that field; otherwise
+    the fields of all predicates in ``literals`` are merged recursively.
+    """
+    if predicate.field is not None:
+        return {predicate.field}
+
+    collected: Set[str] = set()
+    for child in predicate.literals or ():
+        collected |= _get_all_fields(child)
+    return collected
+
+
+def extract_predicate_to_dict(result: dict, input_predicate: 'Predicate', keys: List[str]):
+    """Group the parts of ``input_predicate`` that touch ``keys`` by field name.
+
+    ``and`` predicates are flattened recursively. An ``or`` predicate is kept
+    as a whole only when all of its direct children are leaf predicates on the
+    same single field and that field is one of ``keys``. A leaf predicate is
+    kept when its field is one of ``keys``.
+
+    Args:
+        result: Mapping from field name to a list of predicates. Missing
+            entries are created on demand, so a plain ``dict`` works as well
+            as a pre-populated dict or a ``defaultdict(list)``.
+        input_predicate: Predicate to decompose; a falsy value is ignored.
+        keys: Field names of interest; when empty nothing is collected.
+    """
+    if not input_predicate or not keys:
+        return
+
+    method = input_predicate.method
+    if method == 'and':
+        for sub_predicate in input_predicate.literals:
+            extract_predicate_to_dict(result, sub_predicate, keys)
+        return
+    if method == 'or':
+        branches = input_predicate.literals
+        # A branch without a field is itself an and/or; nested compounds are not handled.
+        if not branches or any(p.field is None for p in branches):
+            return
+        # Usable only when every branch constrains the same single key.
+        involved_fields = {p.field for p in branches}
+        if len(involved_fields) == 1:
+            field = next(iter(involved_fields))
+            if field in keys:
+                result.setdefault(field, []).append(input_predicate)
+        return
+
+    if input_predicate.field in keys:
+        result.setdefault(input_predicate.field, []).append(input_predicate)
@@ -20,11 +20,11 @@
 
 import fastavro
 import pyarrow as pa
+import pyarrow.compute as pc
 import pyarrow.dataset as ds
 from pyarrow import RecordBatch
 
 from pypaimon.common.file_io import FileIO
-from pypaimon.common.predicate import Predicate
 from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
 from pypaimon.schema.data_types import DataField, PyarrowFieldParser
 
@@ -35,26 +35,18 @@ class FormatAvroReader(RecordBatchReader):
     provided predicate and projection, and converts Avro records to RecordBatch format.
     """
 
-    def __init__(self, file_io: FileIO, file_path: str, primary_keys: List[str],
-                 fields: List[str], full_fields: List[DataField], predicate: Predicate, batch_size: int = 4096):
+    def __init__(self, file_io: FileIO, file_path: str, read_fields: List[str], full_fields: List[DataField],
+                 push_down_predicate: pc.Expression | bool, batch_size: int = 4096):
+        """Open the Avro file and prepare the projected Arrow schema.
+
+        Args:
+            file_io: Its ``filesystem.open_input_file`` is used to open ``file_path``.
+            file_path: Path of the Avro file handed to ``fastavro.reader``.
+            read_fields: Names of the columns to read. Each name is looked up in
+                ``full_fields`` by ``name``; an unknown name raises ``KeyError``.
+            full_fields: Field definitions the projection is resolved against.
+            push_down_predicate: Stored as-is and later passed as the scanner
+                ``filter`` in ``read_arrow_batch``.
+                NOTE(review): ``read_arrow_batch`` skips filtering only when this
+                is ``None``, which the annotation does not list - confirm what
+                callers pass.
+            batch_size: Stored as ``_batch_size``; presumably the maximum number
+                of records per batch - the consuming loop is not visible here.
+        """
         self._file = file_io.filesystem.open_input_file(file_path)
         self._avro_reader = fastavro.reader(self._file)
         self._batch_size = batch_size
-        self._primary_keys = primary_keys
+        self._push_down_predicate = push_down_predicate
 
-        self._fields = fields
+        self._fields = read_fields
         full_fields_map = {field.name: field for field in full_fields}
-        projected_data_fields = [full_fields_map[name] for name in fields]
+        projected_data_fields = [full_fields_map[name] for name in read_fields]
         self._schema = PyarrowFieldParser.from_paimon_schema(projected_data_fields)
 
-        if primary_keys:
-            # TODO: utilize predicate to improve performance
-            predicate = None
-        if predicate is not None:
-            self._predicate = predicate.to_arrow()
-        else:
-            self._predicate = None
-
     def read_arrow_batch(self) -> Optional[RecordBatch]:
         pydict_data = {name: [] for name in self._fields}
         records_in_batch = 0
@@ -68,12 +60,12 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
 
         if records_in_batch == 0:
             return None
-        if self._predicate is None:
+        if self._push_down_predicate is None:
             return pa.RecordBatch.from_pydict(pydict_data, self._schema)
         else:
             pa_batch = pa.Table.from_pydict(pydict_data, self._schema)
             dataset = ds.InMemoryDataset(pa_batch)
-            scanner = dataset.scanner(filter=self._predicate)
+            scanner = dataset.scanner(filter=self._push_down_predicate)
             combine_chunks = scanner.to_table().combine_chunks()
             if combine_chunks.num_rows > 0:
                 return combine_chunks.to_batches()[0]