|
8 | 8 |
|
9 | 9 | from linkml_store.api import Collection
|
10 | 10 | from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
|
11 |
| -from linkml_store.api.queries import Query |
| 11 | +from linkml_store.api.queries import Query, QueryResult |
12 | 12 | from linkml_store.api.stores.duckdb.mappings import TMAP
|
13 | 13 | from linkml_store.utils.sql_utils import facet_count_sql
|
14 | 14 |
|
@@ -145,6 +145,166 @@ def _check_if_initialized(self) -> bool:
|
145 | 145 | return True
|
146 | 146 | return False
|
147 | 147 |
|
def group_by(
    self,
    group_by_fields: List[str],
    inlined_field="objects",
    agg_map: Optional[Dict[str, str]] = None,
    where: Optional[Dict] = None,
    **kwargs,
) -> QueryResult:
    """
    Group objects in the collection by specified fields using SQLAlchemy.

    One DISTINCT query finds the group keys; one further query per group
    fetches the member rows. Whenever the SQL path cannot be used (no class
    definition, no table, none of the requested fields are columns, or any
    error while querying) the call is delegated to the parent implementation.

    :param group_by_fields: Field name, or list of field names, to group by.
        Fields that are not columns of the table are left out of the grouping.
    :param inlined_field: Key under which each group's member objects are stored
    :param agg_map: Aggregation map; only the ``"list"`` entry is consulted here.
        When it is non-empty, member objects are restricted to those fields.
    :param where: Filter conditions keyed by column name. A value is either a
        literal (equality) or a dict of operators (``$gt``, ``$gte``, ``$lt``,
        ``$lte``, ``$ne``, ``$in``); unknown operators are treated as equality.
        Keys that are not table columns are ignored (with a warning).
    :param kwargs: Additional arguments, forwarded to the parent implementation
        on fallback
    :return: Query result with one row per group: the group key values plus
        the member objects under ``inlined_field``
    """
    if isinstance(group_by_fields, str):
        group_by_fields = [group_by_fields]

    cd = self.class_definition()
    if not cd:
        logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}")
        return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)

    # Check if the table exists
    if not self.parent._table_exists(self.alias):
        logger.debug(f"Table {self.alias} doesn't exist, falling back to parent implementation")
        return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)

    # Get table definition
    table = self._sqla_table(cd)
    engine = self.parent.engine

    import operator

    from sqlalchemy import select

    column_names = set(table.columns.keys())

    # Only fields that are real columns can be grouped on in SQL. Everything
    # below iterates this filtered list, so positions in a DISTINCT result row
    # always line up with the field names (indexing result rows by position in
    # the unfiltered list would misalign as soon as one field is not a column).
    present_fields = [field for field in group_by_fields if field in column_names]
    if not present_fields:
        logger.warning(f"None of the group_by fields {group_by_fields} found in table columns")
        return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)

    # Translate the where dict into SQLAlchemy conditions once; the same
    # conditions are applied to the DISTINCT query and to every per-group query.
    comparators = {
        "$gt": operator.gt,
        "$gte": operator.ge,
        "$lt": operator.lt,
        "$lte": operator.le,
        "$ne": operator.ne,
    }
    where_conditions = []
    for key, expected in (where or {}).items():
        if key not in column_names:
            logger.warning(f"Ignoring where condition on {key}: not a column of table {self.alias}")
            continue
        column = table.c[key]
        if not isinstance(expected, dict):
            # Direct equality comparison
            where_conditions.append(column == expected)
            continue
        for op, val in expected.items():
            if op in comparators:
                where_conditions.append(comparators[op](column, val))
            elif op == "$in":
                where_conditions.append(column.in_(val))
            else:
                # Default to equality for unknown operators
                logger.warning(f"Unknown operator {op}, using equality")
                where_conditions.append(column == val)

    groups_stmt = select(*[table.c[field] for field in present_fields]).distinct()
    for condition in where_conditions:
        groups_stmt = groups_stmt.where(condition)

    # Optional projection applied to each member object
    list_fields = agg_map.get("list") if agg_map else None

    results = []
    try:
        with engine.connect() as conn:
            # Materialize the group keys before issuing further statements
            # on the same connection.
            group_rows = list(conn.execute(groups_stmt))

            for group_row in group_rows:
                group_dict = dict(zip(present_fields, group_row))

                # Select all rows belonging to this group
                row_stmt = select(*table.columns)
                for field, value in group_dict.items():
                    column = table.c[field]
                    # NULL keys must be matched with IS NULL, not "= NULL"
                    row_stmt = row_stmt.where(column.is_(None) if value is None else column == value)
                for condition in where_conditions:
                    row_stmt = row_stmt.where(condition)

                objects = [dict(zip(row._fields, row)) for row in conn.execute(row_stmt)]

                if list_fields:
                    objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]

                result_obj = group_dict.copy()
                result_obj[inlined_field] = objects
                results.append(result_obj)

        return QueryResult(num_rows=len(results), rows=results)
    except Exception as e:
        # Deliberate best-effort: any SQL-side failure falls back to the
        # parent implementation rather than surfacing to the caller.
        logger.warning(f"Error in DuckDB group_by: {e}")
        return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
| 307 | + |
148 | 308 | def _create_table(self, cd: ClassDefinition):
|
149 | 309 | if self._table_created or self.metadata.is_prepopulated:
|
150 | 310 | logger.info(f"Already have table for: {cd.name}")
|
|
0 commit comments