made dependencies optional

cmungall · cmungall · commit 219336cb2233 · 2025-01-31T19:29:23.000-08:00
diff --git a/Makefile b/Makefile
@@ -18,6 +18,9 @@ pytest-neo4j:
 pytest-core:
 	$(RUN) pytest
 
+pytest-minimal:
+	$(RUN) pytest tests/test_api/test_filesystem_adapter.py
+
 pytest-full:
 	$(RUN) pytest -m ""
 
@@ -28,7 +31,8 @@ all-pytest:
 	$(RUN) pytest -m "integration or not integration"
 
 install-all:
-	poetry install -E analytics -E app -E tests -E llm -E mongodb
+	poetry install --all-extras
+#	poetry install -E analytics -E app -E tests -E llm -E mongodb
 
 DOCTEST_DIR = docs src/linkml_store/api src/linkml_store/index src/linkml_store/utils
 doctest:
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,6 +13,7 @@ pydantic = "^2.0.0"
 linkml-runtime = ">=1.8.0"
 streamlit = { version = "^1.32.2", optional = true }
 sqlalchemy = "*"
+google-cloud-bigquery = "*"
 duckdb = ">=0.10.1"
 duckdb-engine = ">=0.11.2"
 matplotlib = { version = "*", optional = true }
@@ -39,11 +40,14 @@ frictionless = { version="*", optional = true }
 ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
 gcsfs = { version="*", optional = true }
 multipledispatch = { version="*" }
+tabulate = "*"
 pandas = ">=2.2.1"
 jinja2 = "^3.1.4"
 jsonlines = "^4.0.0"
 fastapi = { version="*", optional = true }
 uvicorn = { version="*", optional = true }
+xmltodict = "^0.13.0"
+jsonpatch = "^1.33"
 
 [tool.poetry.group.dev.dependencies]
 pytest = {version = ">=7.1.2"}
@@ -87,6 +91,8 @@ fastapi = ["fastapi", "uvicorn"]
 frictionless = ["frictionless"]
 scipy = ["scipy", "scikit-learn"]
 ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
+bigquery = ["google-cloud-bigquery"]
+all = ["llm", "mongodb", "neo4j", "chromadb", "validation", "map", "renderer", "ibis", "bigquery"]
 
 [tool.poetry.scripts]
 linkml-store = "linkml_store.cli:cli"
diff --git a/src/linkml_store/api/client.py b/src/linkml_store/api/client.py
@@ -1,3 +1,4 @@
+import importlib
 import logging
 from pathlib import Path
 from typing import Dict, Optional, Union
@@ -7,23 +8,18 @@
 
 from linkml_store.api import Database
 from linkml_store.api.config import ClientConfig
-from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
-from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
-from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
-from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
-from linkml_store.api.stores.neo4j.neo4j_database import Neo4jDatabase
-from linkml_store.api.stores.solr.solr_database import SolrDatabase
 
 logger = logging.getLogger(__name__)
 
 
+
 HANDLE_MAP = {
-    "duckdb": DuckDBDatabase,
-    "solr": SolrDatabase,
-    "mongodb": MongoDBDatabase,
-    "chromadb": ChromaDBDatabase,
-    "neo4j": Neo4jDatabase,
-    "file": FileSystemDatabase,
+    "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
+    "solr": "linkml_store.api.stores.solr.solr_database.SolrDatabase",
+    "mongodb": "linkml_store.api.stores.mongodb.mongodb_database.MongoDBDatabase",
+    "chromadb": "linkml_store.api.stores.chromadb.chromadb_database.ChromaDBDatabase",
+    "neo4j": "linkml_store.api.stores.neo4j.neo4j_database.Neo4jDatabase",
+    "file": "linkml_store.api.stores.filesystem.filesystem_database.FileSystemDatabase",
 }
 
 
@@ -155,6 +151,9 @@ def _initialize_databases(self, auto_attach=False, **kwargs):
             if auto_attach:
                 db = self.attach_database(handle, alias=name, **kwargs)
                 db.from_config(db_config)
+            if db_config.source:
+                db = self.get_database(name)
+                db.store(db_config.source.data)
 
     def _set_database_config(self, db: Database):
         """
@@ -207,7 +206,14 @@ def attach_database(
             scheme, _ = handle.split(":", 1)
         if scheme not in HANDLE_MAP:
             raise ValueError(f"Unknown scheme: {scheme}")
-        cls = HANDLE_MAP[scheme]
+        module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
+        try:
+            module = importlib.import_module(module_path)
+            cls = getattr(module, class_name)
+        except ImportError as e:
+            raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")
+
+        #cls = HANDLE_MAP[scheme]
         db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
         if schema_view:
             db.set_schema_view(schema_view)
diff --git a/src/linkml_store/api/collection.py b/src/linkml_store/api/collection.py
@@ -470,6 +470,7 @@ def search(
         where: Optional[Any] = None,
         index_name: Optional[str] = None,
         limit: Optional[int] = None,
+        select_cols: Optional[List[str]] = None,
         mmr_relevance_factor: Optional[float] = None,
         **kwargs,
     ) -> QueryResult:
@@ -503,6 +504,7 @@ def search(
         :param where:
         :param index_name:
         :param limit:
+        :param select_cols:
         :param kwargs:
         :return:
         """
@@ -538,6 +540,11 @@ def search(
         results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
+        if select_cols:
+            new_results = []
+            for r in results:
+                new_results.append((r[0], {k: v for k, v in r[1].items() if k in select_cols}))
+            results = new_results
         new_qr = QueryResult(num_rows=len(results))
         new_qr.ranked_rows = results
         new_qr.rows = [r[1] for r in results]
@@ -672,6 +679,7 @@ def rows_iter(self) -> Iterable[OBJECT]:
         """
         yield from self.find({}, limit=-1).rows
 
+    @property
     def rows(self) -> List[OBJECT]:
         """
         Return a list of objects in the collection.
diff --git a/src/linkml_store/api/config.py b/src/linkml_store/api/config.py
@@ -91,7 +91,7 @@ class CollectionConfig(ConfiguredBaseModel):
     )
     source: Optional[CollectionSource] = Field(
         default=None,
-        description="Metadata about the source",
+        description="Source for the collection",
     )
     derived_from: Optional[List[DerivationConfiguration]] = Field(
         default=None,
@@ -154,6 +154,10 @@ class DatabaseConfig(ConfiguredBaseModel):
         default=False,
         description="Whether to ensure referential integrity",
     )
+    source: Optional[CollectionSource] = Field(
+        default=None,
+        description="Source for the database",
+    )
 
 
 class ClientConfig(ConfiguredBaseModel):
diff --git a/src/linkml_store/api/stores/filesystem/filesystem_database.py b/src/linkml_store/api/stores/filesystem/filesystem_database.py
@@ -3,7 +3,7 @@
 from typing import Optional
 
 import yaml
-from linkml.utils.schema_builder import SchemaBuilder
+from linkml_runtime.utils.schema_builder import SchemaBuilder
 from linkml_runtime import SchemaView
 
 from linkml_store.api import Database
diff --git a/src/linkml_store/cli.py b/src/linkml_store/cli.py
@@ -135,12 +135,17 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
     if input:
-        stem = underscore(Path(input).stem)
-        database = "duckdb"
-        collection = stem
+        database = "duckdb" # default: store in duckdb
+        if input.startswith("http"):
+            parts = input.split("/")
+            collection = parts[-1]
+            collection = collection.split(".")[0]
+        else:
+            stem = underscore(Path(input).stem)
+            collection = stem
+        logger.info(f"Using input file: {input}, "
+                    f"default storage is {database} and collection is {collection}")
         config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
-        # collection = Path(input).stem
-        # database = f"file:{Path(input).parent}"
     if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
         config = DEFAULT_LOCAL_CONF_PATH
     if config is None and DEFAULT_GLOBAL_CONF_PATH.exists():
@@ -178,10 +183,11 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
 
 @cli.command()
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
+@click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
 @click.pass_context
-def insert(ctx, files, object, format):
+def insert(ctx, files, replace, object, format):
     """Insert objects from files (JSON, YAML, TSV) into the specified collection.
 
     Using a configuration:
@@ -195,7 +201,6 @@ def insert(ctx, files, object, format):
     collection = settings.collection
     if not collection:
         raise ValueError("Collection must be specified.")
-    objects = []
     if not files and not object:
         files = ["-"]
     for file_path in files:
@@ -204,13 +209,19 @@ def insert(ctx, files, object, format):
         else:
             objects = load_objects(file_path)
         logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
-        collection.insert(objects)
+        if replace:
+            collection.replace(objects)
+        else:
+            collection.insert(objects)
         click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
     if object:
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
-            collection.insert(objects)
+            if replace:
+                collection.replace(objects)
+            else:
+                collection.insert(objects)
             click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
     collection.commit()
 
@@ -534,10 +545,12 @@ def detuple(t: Tuple) -> Any:
 @click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
 @click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
 @click.option("--query", "-q", type=click.STRING, help="query term")
+@click.option("--where", "-w", type=click.STRING, help="query term")
 @click.pass_context
 def infer(
     ctx,
     inference_config_file,
+    where,
     query,
     evaluation_count,
     evaluation_match_function,
@@ -579,6 +592,7 @@ def infer(
         linkml-store -i tests/input/iris.csv inference -t sklearn \
            -q '{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}'
     """
+    where_clause = yaml.safe_load(where) if where else None
     if query:
         query_obj = yaml.safe_load(query)
     else:
@@ -681,6 +695,7 @@ def schema(ctx, output_type, output):
 @cli.command()
 @click.argument("search_term")
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the search")
+@click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
@@ -689,13 +704,14 @@ def schema(ctx, output_type, output):
 )
 @index_type_option
 @click.pass_context
-def search(ctx, search_term, where, limit, index_type, output_type, output, auto_index):
+def search(ctx, search_term, where, select, limit, index_type, output_type, output, auto_index):
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type)
     logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
     collection.attach_indexer(ix, auto_index=auto_index)
-    result = collection.search(search_term, where=where, limit=limit)
+    select_cols = yaml.safe_load(select) if select else None
+    result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
     if output:
         with open(output, "w") as f:
diff --git a/src/linkml_store/index/implementations/llm_indexer.py b/src/linkml_store/index/implementations/llm_indexer.py
@@ -3,7 +3,6 @@
 from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
-from tiktoken import encoding_for_model
 
 from linkml_store.api.config import CollectionConfig
 from linkml_store.index.indexer import INDEX_ITEM, Indexer
@@ -55,28 +54,32 @@ def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM:
 
     def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
         """
-        Use LLM to embed
+        Use LLM to embed.
 
         >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
         >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])
 
         :param texts:
         :return:
         """
+        from tiktoken import encoding_for_model
         logging.info(f"Converting {len(texts)} texts to vectors")
         model = self.embedding_model
-        token_limit = get_token_limit(model.model_id)
+        # TODO: make this more accurate
+        token_limit = get_token_limit(model.model_id) - 200
         encoding = encoding_for_model("gpt-4o")
 
         def truncate_text(text: str) -> str:
             # split into tokens every 1000 chars:
             parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
-            return render_formatted_text(
+            truncated = render_formatted_text(
                 lambda x: "".join(x),
                 parts,
                 encoding,
                 token_limit,
             )
+            logger.debug(f"Truncated text from {len(text)} to {len(truncated)}")
+            return truncated
 
         texts = [truncate_text(text) for text in texts]
 
diff --git a/src/linkml_store/inference/inference_engine.py b/src/linkml_store/inference/inference_engine.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import Optional, TextIO, Tuple, Union
+from typing import Optional, TextIO, Tuple, Union, Any
 
 import pandas as pd
 from pydantic import BaseModel, ConfigDict
@@ -67,13 +67,14 @@ class CollectionSlice(BaseModel):
     # slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
     indices: Optional[Tuple[int, ...]] = None
     _collection: Optional[Collection] = None
+    where: Any = None
 
     @property
     def collection(self) -> Collection:
         if not self._collection and not self.indices:
             return self.base_collection
         if not self._collection:
-            rows = self.base_collection.find({}, limit=-1).rows
+            rows = self.base_collection.rows
             subset = [rows[i] for i in self.indices]
             db = self.base_collection.parent
             subset_name = self.slice_alias
@@ -94,6 +95,7 @@ def as_dataframe(self, flattened=False) -> pd.DataFrame:
         """
         Return the slice of the collection as a dataframe.
 
+        :param flattened: flatten nested objects to give keys like foo.bar
         :return:
         """
         rs = self.collection.find({}, limit=-1)
diff --git a/src/linkml_store/utils/format_utils.py b/src/linkml_store/utils/format_utils.py
diff --git a/src/linkml_store/utils/llm_utils.py b/src/linkml_store/utils/llm_utils.py
diff --git a/tests/test_bigquery.py b/tests/test_bigquery.py

Original file line number	Diff line number	Diff line change
`@@ -91,7 +91,7 @@ class CollectionConfig(ConfiguredBaseModel):`
`91`	`91`	`)`
`92`	`92`	`source: Optional[CollectionSource] = Field(`
`93`	`93`	`default=None,`
`94`		`- description="Metadata about the source",`
	`94`	`+ description="Source for the collection",`
`95`	`95`	`)`
`96`	`96`	`derived_from: Optional[List[DerivationConfiguration]] = Field(`
`97`	`97`	`default=None,`
`@@ -154,6 +154,10 @@ class DatabaseConfig(ConfiguredBaseModel):`
`154`	`154`	`default=False,`
`155`	`155`	`description="Whether to ensure referential integrity",`
`156`	`156`	`)`
	`157`	`+ source: Optional[CollectionSource] = Field(`
	`158`	`+ default=None,`
	`159`	`+ description="Source for the database",`
	`160`	`+ )`
`157`	`161`
`158`	`162`
`159`	`163`	`class ClientConfig(ConfiguredBaseModel):`