Skip to content

Commit 0e5ee40

Browse files
authored
Merge pull request #44 from linkml/add-specialized-group-by-implementations
Add specialized group_by implementations for DuckDB and MongoDB
2 parents 0057342 + 5ddefea commit 0e5ee40

File tree

4 files changed

+348
-1
lines changed

4 files changed

+348
-1
lines changed

src/linkml_store/api/collection.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,11 @@ def group_by(
641641
if isinstance(group_by_fields, str):
642642
group_by_fields = [group_by_fields]
643643
df = self.find(where=where, limit=-1).rows_dataframe
644+
645+
# Handle the case where agg_map is None
646+
if agg_map is None:
647+
agg_map = {}
648+
644649
pk_fields = agg_map.get("first", []) + group_by_fields
645650
list_fields = agg_map.get("list", [])
646651
if not list_fields:

src/linkml_store/api/stores/duckdb/duckdb_collection.py

Lines changed: 161 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from linkml_store.api import Collection
1010
from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
11-
from linkml_store.api.queries import Query
11+
from linkml_store.api.queries import Query, QueryResult
1212
from linkml_store.api.stores.duckdb.mappings import TMAP
1313
from linkml_store.utils.sql_utils import facet_count_sql
1414

@@ -145,6 +145,166 @@ def _check_if_initialized(self) -> bool:
145145
return True
146146
return False
147147

148+
def group_by(
    self,
    group_by_fields: List[str],
    inlined_field="objects",
    agg_map: Optional[Dict[str, str]] = None,
    where: Optional[Dict] = None,
    **kwargs,
) -> QueryResult:
    """
    Group objects in the collection by specified fields using SQLAlchemy.

    This implementation leverages DuckDB's SQL capabilities for more efficient grouping.
    It falls back to the generic parent implementation whenever the table or class
    definition is unavailable, or if any SQL error occurs.

    :param group_by_fields: List of fields to group by (a bare string is wrapped in a list)
    :param inlined_field: Field name to store aggregated objects
    :param agg_map: Dictionary mapping aggregation types to field lists;
        only the "list" entry is honored here (projects each grouped object)
    :param where: Filter conditions; values may be scalars (equality) or
        Mongo-style operator dicts ($gt, $gte, $lt, $lte, $ne, $in)
    :param kwargs: Additional arguments, forwarded to the parent on fallback
    :return: Query result containing one row per group, each with the grouped
        objects under ``inlined_field``
    """
    if isinstance(group_by_fields, str):
        group_by_fields = [group_by_fields]

    cd = self.class_definition()
    if not cd:
        logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}")
        return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)

    # Check if the table exists before attempting direct SQL access
    if not self.parent._table_exists(self.alias):
        logger.debug(f"Table {self.alias} doesn't exist, falling back to parent implementation")
        return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)

    table = self._sqla_table(cd)
    engine = self.parent.engine

    from sqlalchemy import select

    # Filter to fields that exist as columns up front. This filtered list is
    # then used for BOTH the SELECT column list and the per-row value lookup,
    # so positional indices stay aligned even when some requested fields are
    # missing from the table (the previous code enumerated the unfiltered
    # list and could read the wrong column, or run off the end of the row).
    valid_fields = [f for f in group_by_fields if f in table.columns.keys()]

    if not valid_fields:
        logger.warning(f"None of the group_by fields {group_by_fields} found in table columns")
        return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)

    def _where_clauses(where_map: Optional[Dict]) -> list:
        """Translate a Mongo-style ``where`` mapping into SQLAlchemy clauses.

        Unknown columns are silently skipped; unknown operators degrade to
        equality with a warning. Shared by the distinct-groups query and the
        per-group row query, which previously duplicated this logic inline.
        """
        clauses = []
        if not where_map:
            return clauses
        for k, v in where_map.items():
            if k not in table.columns.keys():
                continue
            if isinstance(v, dict):
                # Operator dict, e.g. {"$gt": 5}
                for op, val in v.items():
                    if op == "$gt":
                        clauses.append(table.c[k] > val)
                    elif op == "$gte":
                        clauses.append(table.c[k] >= val)
                    elif op == "$lt":
                        clauses.append(table.c[k] < val)
                    elif op == "$lte":
                        clauses.append(table.c[k] <= val)
                    elif op == "$ne":
                        clauses.append(table.c[k] != val)
                    elif op == "$in":
                        clauses.append(table.c[k].in_(val))
                    else:
                        # Default to equality for unknown operators
                        logger.warning(f"Unknown operator {op}, using equality")
                        clauses.append(table.c[k] == val)
            else:
                # Direct equality comparison
                clauses.append(table.c[k] == v)
        return clauses

    group_cols = [table.c[f] for f in valid_fields]
    stmt = select(*group_cols).distinct()
    for clause in _where_clauses(where):
        stmt = stmt.where(clause)

    results = []
    try:
        with engine.connect() as conn:
            # First pass: enumerate the distinct groups
            group_rows = list(conn.execute(stmt))

            # Second pass: fetch all member rows for each group.
            # NOTE(review): this issues one query per group (N+1); acceptable
            # for modest group counts, but a single GROUP BY/window query
            # would scale better — confirm before using on large tables.
            for group_row in group_rows:
                group_dict = {}
                group_conditions = []
                for i, field in enumerate(valid_fields):
                    value = group_row[i]
                    group_dict[field] = value
                    # SQL NULL needs IS NULL, not "= NULL"
                    if value is None:
                        group_conditions.append(table.c[field].is_(None))
                    else:
                        group_conditions.append(table.c[field] == value)

                row_stmt = select(*table.columns)
                for condition in group_conditions:
                    row_stmt = row_stmt.where(condition)
                # Re-apply the original filter so group members also satisfy it
                for clause in _where_clauses(where):
                    row_stmt = row_stmt.where(clause)

                rows = list(conn.execute(row_stmt))

                # Convert SQLAlchemy row tuples to plain dictionaries
                objects = [dict(zip(row._fields, row)) for row in rows]

                # Apply agg_map "list" projection to each grouped object
                if agg_map and agg_map.get("list"):
                    list_fields = agg_map["list"]
                    objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]

                result_obj = group_dict.copy()
                result_obj[inlined_field] = objects
                results.append(result_obj)

        return QueryResult(num_rows=len(results), rows=results)
    except Exception as e:
        # Best-effort: any SQL-level failure falls back to the generic path
        logger.warning(f"Error in DuckDB group_by: {e}")
        return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
307+
148308
def _create_table(self, cd: ClassDefinition):
149309
if self._table_created or self.metadata.is_prepopulated:
150310
logger.info(f"Already have table for: {cd.name}")

src/linkml_store/api/stores/mongodb/mongodb_collection.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,3 +265,101 @@ def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True,
265265
if deleted_rows_count == 0 and not missing_ok:
266266
raise ValueError(f"No rows found for {where}")
267267
return deleted_rows_count
268+
269+
def group_by(
    self,
    group_by_fields: List[str],
    inlined_field="objects",
    agg_map: Optional[Dict[str, str]] = None,
    where: Optional[Dict] = None,
    **kwargs,
) -> QueryResult:
    """
    Group objects in the collection by specified fields using MongoDB's aggregation pipeline.

    This implementation leverages MongoDB's native aggregation capabilities for efficient
    grouping: an optional ``$match`` stage applies ``where``, then a ``$group`` stage
    pushes every matching document into its group.

    :param group_by_fields: List of fields to group by (a bare string is wrapped in a list)
    :param inlined_field: Field name to store aggregated objects
    :param agg_map: Dictionary mapping aggregation types ("first", "list") to field lists;
        "list" projects each grouped object to those fields, "first" alone strips those
        fields from the grouped objects (they already appear on the group row)
    :param where: Filter conditions, passed through as a MongoDB ``$match`` expression
    :param kwargs: Additional arguments; ``skip_nulls=True`` drops the null group
    :return: Query result containing one row per group, each with the grouped
        objects under ``inlined_field``
    """
    if isinstance(group_by_fields, str):
        group_by_fields = [group_by_fields]

    # Build the $group _id: a single "$field" path, or a doc for compound keys
    if len(group_by_fields) == 1:
        group_id = f"${group_by_fields[0]}"
    else:
        group_id = {field: f"${field}" for field in group_by_fields}

    pipeline = []
    if where:
        pipeline.append({"$match": where})
    pipeline.append({"$group": {"_id": group_id, "objects": {"$push": "$$ROOT"}}})

    logger.debug(f"MongoDB group_by pipeline: {pipeline}")
    aggregation_results = list(self.mongo_collection.aggregate(pipeline))

    skip_nulls = kwargs.get("skip_nulls", False)
    results = []
    for result in aggregation_results:
        # Documents missing the group field all land in the null group;
        # drop it only when the caller opted in via skip_nulls
        if result["_id"] is None and skip_nulls:
            continue

        # Reconstruct the group key as a flat object
        if isinstance(result["_id"], dict):
            # Compound key: _id already maps field -> value
            group_obj = result["_id"]
        else:
            group_obj = {group_by_fields[0]: result["_id"]}

        objects = result["objects"]

        # Strip MongoDB's internal _id from each member document
        for obj in objects:
            obj.pop("_id", None)

        # Apply field selection based on agg_map. "first" fields are already
        # represented in group_obj via the $group _id, so they need no
        # per-object handling of their own.
        if agg_map:
            first_fields = agg_map.get("first", [])
            list_fields = agg_map.get("list", [])
            if list_fields:
                # Project each grouped object down to the requested fields
                objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
            elif first_fields:
                # No explicit list fields: remove the "first" fields from
                # each object to avoid duplicating the group key values
                objects = [{k: v for k, v in obj.items() if k not in first_fields} for obj in objects]

        group_obj[inlined_field] = objects
        results.append(group_obj)

    return QueryResult(num_rows=len(results), rows=results)

tests/test_api/test_api.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,90 @@ def test_group_by(handle):
334334
assert False, f"Unexpected id: {row['id']}"
335335

336336

337+
@pytest.mark.parametrize("handle", SCHEMES_PLUS)
def test_group_by_advanced(handle):
    """
    Test more advanced group_by features for specific store implementations.

    Tests various features:
    1. Multi-field grouping
    2. Filtering with where clause
    3. Aggregation of specific fields
    4. Different inlined field name
    """
    client = create_client(handle)
    database = client.get_database()

    # A dataset rich enough to exercise several grouping dimensions
    products = [
        {"id": 1, "category": "A", "name": "Item1", "price": 10.0, "qty": 5, "tags": ["red", "small"]},
        {"id": 2, "category": "A", "name": "Item2", "price": 20.0, "qty": 3, "tags": ["blue", "medium"]},
        {"id": 3, "category": "B", "name": "Item3", "price": 15.0, "qty": 7, "tags": ["red", "large"]},
        {"id": 4, "category": "B", "name": "Item4", "price": 25.0, "qty": 2, "tags": ["green", "small"]},
        {"id": 5, "category": "A", "name": "Item5", "price": 30.0, "qty": 1, "tags": ["blue", "large"]},
    ]

    collection = database.create_collection("Products", recreate_if_exists=True)
    collection.insert(products)

    # Test 1: Group by a single field
    result = collection.group_by(["category"])
    assert result.num_rows == 2

    # Verify correct grouping via an expected-size lookup
    expected_sizes = {"A": 3, "B": 2}
    for group in result.rows:
        category = group["category"]
        assert category in expected_sizes, f"Unexpected category: {group['category']}"
        assert len(group["objects"]) == expected_sizes[category]

    # Test 2: Group by multiple scalar fields (avoid using array fields in multi-field grouping)
    result = collection.group_by(["category", "name"])
    # Just check that it doesn't error - the exact results will depend on implementation

    # Test 3: Group with a where clause - use exact match for compatibility
    # Filter for category "A" items only
    result = collection.group_by(["category"], where={"category": "A"})
    assert result.num_rows == 1
    only_group = result.rows[0]
    assert only_group["category"] == "A"
    assert len(only_group["objects"]) == 3

    # For MongoDB specific test, if this is MongoDB handle
    if "mongodb" in handle:
        # Uses MongoDB's query operators
        result = collection.group_by(["category"], where={"price": {"$gt": 15.0}})

        # Find the group with category "A"
        a_group = next((g for g in result.rows if g["category"] == "A"), None)
        if a_group is not None:
            # Should only include items with price > 15.0
            assert all(item["price"] > 15.0 for item in a_group["objects"])

    # Test 4: Custom inlined field name
    result = collection.group_by(["category"], inlined_field="items")
    for group in result.rows:
        assert "items" in group
        assert "objects" not in group

    # Test 5: Test with agg_map for field selection (skip for file adapter which doesn't fully support agg_map)
    if "file:" not in handle:
        result = collection.group_by(
            ["category"],
            agg_map={"first": ["category"], "list": ["name", "price"]}
        )

        # Verify that only specified fields are included
        for group in result.rows:
            for item in group["objects"]:
                assert "name" in item
                assert "price" in item
                assert "qty" not in item  # This field should be excluded
                assert "tags" not in item  # This field should be excluded
419+
420+
337421
@pytest.mark.parametrize("handle", SCHEMES_PLUS)
338422
def test_collections_of_same_type(handle):
339423
"""

0 commit comments

Comments
 (0)