Skip to content

Commit 219336c

Browse files
committed
made dependencies optional
1 parent 8b9cfb8 commit 219336c

File tree

13 files changed

+2861
-2440
lines changed

13 files changed

+2861
-2440
lines changed

Makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ pytest-neo4j:
1818
pytest-core:
1919
$(RUN) pytest
2020

21+
pytest-minimal:
22+
$(RUN) pytest tests/test_api/test_filesystem_adapter.py
23+
2124
pytest-full:
2225
$(RUN) pytest -m ""
2326

@@ -28,7 +31,8 @@ all-pytest:
2831
$(RUN) pytest -m "integration or not integration"
2932

3033
install-all:
31-
poetry install -E analytics -E app -E tests -E llm -E mongodb
34+
poetry install --all-extras
35+
# poetry install -E analytics -E app -E tests -E llm -E mongodb
3236

3337
DOCTEST_DIR = docs src/linkml_store/api src/linkml_store/index src/linkml_store/utils
3438
doctest:

poetry.lock

Lines changed: 2691 additions & 2403 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ pydantic = "^2.0.0"
1313
linkml-runtime = ">=1.8.0"
1414
streamlit = { version = "^1.32.2", optional = true }
1515
sqlalchemy = "*"
16+
google-cloud-bigquery = "*"
1617
duckdb = ">=0.10.1"
1718
duckdb-engine = ">=0.11.2"
1819
matplotlib = { version = "*", optional = true }
@@ -39,11 +40,14 @@ frictionless = { version="*", optional = true }
3940
ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
4041
gcsfs = { version="*", optional = true }
4142
multipledispatch = { version="*" }
43+
tabulate = "*"
4244
pandas = ">=2.2.1"
4345
jinja2 = "^3.1.4"
4446
jsonlines = "^4.0.0"
4547
fastapi = { version="*", optional = true }
4648
uvicorn = { version="*", optional = true }
49+
xmltodict = "^0.13.0"
50+
jsonpatch = "^1.33"
4751

4852
[tool.poetry.group.dev.dependencies]
4953
pytest = {version = ">=7.1.2"}
@@ -87,6 +91,8 @@ fastapi = ["fastapi", "uvicorn"]
8791
frictionless = ["frictionless"]
8892
scipy = ["scipy", "scikit-learn"]
8993
ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
94+
bigquery = ["google-cloud-bigquery"]
95+
all = ["llm", "mongodb", "neo4j", "chromadb", "validation", "map", "renderer", "ibis", "bigquery"]
9096

9197
[tool.poetry.scripts]
9298
linkml-store = "linkml_store.cli:cli"

src/linkml_store/api/client.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import importlib
12
import logging
23
from pathlib import Path
34
from typing import Dict, Optional, Union
@@ -7,23 +8,18 @@
78

89
from linkml_store.api import Database
910
from linkml_store.api.config import ClientConfig
10-
from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
11-
from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
12-
from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
13-
from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
14-
from linkml_store.api.stores.neo4j.neo4j_database import Neo4jDatabase
15-
from linkml_store.api.stores.solr.solr_database import SolrDatabase
1611

1712
logger = logging.getLogger(__name__)
1813

1914

15+
2016
HANDLE_MAP = {
21-
"duckdb": DuckDBDatabase,
22-
"solr": SolrDatabase,
23-
"mongodb": MongoDBDatabase,
24-
"chromadb": ChromaDBDatabase,
25-
"neo4j": Neo4jDatabase,
26-
"file": FileSystemDatabase,
17+
"duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
18+
"solr": "linkml_store.api.stores.solr.solr_database.SolrDatabase",
19+
"mongodb": "linkml_store.api.stores.mongodb.mongodb_database.MongoDBDatabase",
20+
"chromadb": "linkml_store.api.stores.chromadb.chromadb_database.ChromaDBDatabase",
21+
"neo4j": "linkml_store.api.stores.neo4j.neo4j_database.Neo4jDatabase",
22+
"file": "linkml_store.api.stores.filesystem.filesystem_database.FileSystemDatabase",
2723
}
2824

2925

@@ -155,6 +151,9 @@ def _initialize_databases(self, auto_attach=False, **kwargs):
155151
if auto_attach:
156152
db = self.attach_database(handle, alias=name, **kwargs)
157153
db.from_config(db_config)
154+
if db_config.source:
155+
db = self.get_database(name)
156+
db.store(db_config.source.data)
158157

159158
def _set_database_config(self, db: Database):
160159
"""
@@ -207,7 +206,14 @@ def attach_database(
207206
scheme, _ = handle.split(":", 1)
208207
if scheme not in HANDLE_MAP:
209208
raise ValueError(f"Unknown scheme: {scheme}")
210-
cls = HANDLE_MAP[scheme]
209+
module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
210+
try:
211+
module = importlib.import_module(module_path)
212+
cls = getattr(module, class_name)
213+
except ImportError as e:
214+
raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")
215+
216+
#cls = HANDLE_MAP[scheme]
211217
db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
212218
if schema_view:
213219
db.set_schema_view(schema_view)

src/linkml_store/api/collection.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,7 @@ def search(
470470
where: Optional[Any] = None,
471471
index_name: Optional[str] = None,
472472
limit: Optional[int] = None,
473+
select_cols: Optional[List[str]] = None,
473474
mmr_relevance_factor: Optional[float] = None,
474475
**kwargs,
475476
) -> QueryResult:
@@ -503,6 +504,7 @@ def search(
503504
:param where:
504505
:param index_name:
505506
:param limit:
507+
:param select_cols:
506508
:param kwargs:
507509
:return:
508510
"""
@@ -538,6 +540,11 @@ def search(
538540
results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
539541
for r in results:
540542
del r[1][index_col]
543+
if select_cols:
544+
new_results = []
545+
for r in results:
546+
new_results.append((r[0], {k: v for k, v in r[1].items() if k in select_cols}))
547+
results = new_results
541548
new_qr = QueryResult(num_rows=len(results))
542549
new_qr.ranked_rows = results
543550
new_qr.rows = [r[1] for r in results]
@@ -672,6 +679,7 @@ def rows_iter(self) -> Iterable[OBJECT]:
672679
"""
673680
yield from self.find({}, limit=-1).rows
674681

682+
@property
675683
def rows(self) -> List[OBJECT]:
676684
"""
677685
Return a list of objects in the collection.

src/linkml_store/api/config.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ class CollectionConfig(ConfiguredBaseModel):
9191
)
9292
source: Optional[CollectionSource] = Field(
9393
default=None,
94-
description="Metadata about the source",
94+
description="Source for the collection",
9595
)
9696
derived_from: Optional[List[DerivationConfiguration]] = Field(
9797
default=None,
@@ -154,6 +154,10 @@ class DatabaseConfig(ConfiguredBaseModel):
154154
default=False,
155155
description="Whether to ensure referential integrity",
156156
)
157+
source: Optional[CollectionSource] = Field(
158+
default=None,
159+
description="Source for the database",
160+
)
157161

158162

159163
class ClientConfig(ConfiguredBaseModel):

src/linkml_store/api/stores/filesystem/filesystem_database.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import Optional
44

55
import yaml
6-
from linkml.utils.schema_builder import SchemaBuilder
6+
from linkml_runtime.utils.schema_builder import SchemaBuilder
77
from linkml_runtime import SchemaView
88

99
from linkml_store.api import Database

src/linkml_store/cli.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -135,12 +135,17 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
135135
logger.setLevel(logging.ERROR)
136136
ctx.ensure_object(dict)
137137
if input:
138-
stem = underscore(Path(input).stem)
139-
database = "duckdb"
140-
collection = stem
138+
database = "duckdb" # default: store in duckdb
139+
if input.startswith("http"):
140+
parts = input.split("/")
141+
collection = parts[-1]
142+
collection = collection.split(".")[0]
143+
else:
144+
stem = underscore(Path(input).stem)
145+
collection = stem
146+
logger.info(f"Using input file: {input}, "
147+
f"default storage is {database} and collection is {collection}")
141148
config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
142-
# collection = Path(input).stem
143-
# database = f"file:{Path(input).parent}"
144149
if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
145150
config = DEFAULT_LOCAL_CONF_PATH
146151
if config is None and DEFAULT_GLOBAL_CONF_PATH.exists():
@@ -178,10 +183,11 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
178183

179184
@cli.command()
180185
@click.argument("files", type=click.Path(exists=True), nargs=-1)
186+
@click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
181187
@click.option("--format", "-f", type=format_choice, help="Input format")
182188
@click.option("--object", "-i", multiple=True, help="Input object as YAML")
183189
@click.pass_context
184-
def insert(ctx, files, object, format):
190+
def insert(ctx, files, replace, object, format):
185191
"""Insert objects from files (JSON, YAML, TSV) into the specified collection.
186192
187193
Using a configuration:
@@ -195,7 +201,6 @@ def insert(ctx, files, object, format):
195201
collection = settings.collection
196202
if not collection:
197203
raise ValueError("Collection must be specified.")
198-
objects = []
199204
if not files and not object:
200205
files = ["-"]
201206
for file_path in files:
@@ -204,13 +209,19 @@ def insert(ctx, files, object, format):
204209
else:
205210
objects = load_objects(file_path)
206211
logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
207-
collection.insert(objects)
212+
if replace:
213+
collection.replace(objects)
214+
else:
215+
collection.insert(objects)
208216
click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
209217
if object:
210218
for object_str in object:
211219
logger.info(f"Parsing: {object_str}")
212220
objects = yaml.safe_load(object_str)
213-
collection.insert(objects)
221+
if replace:
222+
collection.replace(objects)
223+
else:
224+
collection.insert(objects)
214225
click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
215226
collection.commit()
216227

@@ -534,10 +545,12 @@ def detuple(t: Tuple) -> Any:
534545
@click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
535546
@click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
536547
@click.option("--query", "-q", type=click.STRING, help="query term")
548+
@click.option("--where", "-w", type=click.STRING, help="query term")
537549
@click.pass_context
538550
def infer(
539551
ctx,
540552
inference_config_file,
553+
where,
541554
query,
542555
evaluation_count,
543556
evaluation_match_function,
@@ -579,6 +592,7 @@ def infer(
579592
linkml-store -i tests/input/iris.csv inference -t sklearn \
580593
-q '{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}'
581594
"""
595+
where_clause = yaml.safe_load(where) if where else None
582596
if query:
583597
query_obj = yaml.safe_load(query)
584598
else:
@@ -681,6 +695,7 @@ def schema(ctx, output_type, output):
681695
@cli.command()
682696
@click.argument("search_term")
683697
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the search")
698+
@click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
684699
@click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
685700
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
686701
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@@ -689,13 +704,14 @@ def schema(ctx, output_type, output):
689704
)
690705
@index_type_option
691706
@click.pass_context
692-
def search(ctx, search_term, where, limit, index_type, output_type, output, auto_index):
707+
def search(ctx, search_term, where, select, limit, index_type, output_type, output, auto_index):
693708
"""Search objects in the specified collection."""
694709
collection = ctx.obj["settings"].collection
695710
ix = get_indexer(index_type)
696711
logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
697712
collection.attach_indexer(ix, auto_index=auto_index)
698-
result = collection.search(search_term, where=where, limit=limit)
713+
select_cols = yaml.safe_load(select) if select else None
714+
result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)
699715
output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
700716
if output:
701717
with open(output, "w") as f:

src/linkml_store/index/implementations/llm_indexer.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from typing import TYPE_CHECKING, List, Optional
44

55
import numpy as np
6-
from tiktoken import encoding_for_model
76

87
from linkml_store.api.config import CollectionConfig
98
from linkml_store.index.indexer import INDEX_ITEM, Indexer
@@ -55,28 +54,32 @@ def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM:
5554

5655
def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
5756
"""
58-
Use LLM to embed
57+
Use LLM to embed.
5958
6059
>>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
6160
>>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])
6261
6362
:param texts:
6463
:return:
6564
"""
65+
from tiktoken import encoding_for_model
6666
logging.info(f"Converting {len(texts)} texts to vectors")
6767
model = self.embedding_model
68-
token_limit = get_token_limit(model.model_id)
68+
# TODO: make this more accurate
69+
token_limit = get_token_limit(model.model_id) - 200
6970
encoding = encoding_for_model("gpt-4o")
7071

7172
def truncate_text(text: str) -> str:
7273
# split into chunks of 1000 chars:
7374
parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
74-
return render_formatted_text(
75+
truncated = render_formatted_text(
7576
lambda x: "".join(x),
7677
parts,
7778
encoding,
7879
token_limit,
7980
)
81+
logger.debug(f"Truncated text from {len(text)} to {len(truncated)}")
82+
return truncated
8083

8184
texts = [truncate_text(text) for text in texts]
8285

src/linkml_store/inference/inference_engine.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from dataclasses import dataclass
55
from enum import Enum
66
from pathlib import Path
7-
from typing import Optional, TextIO, Tuple, Union
7+
from typing import Optional, TextIO, Tuple, Union, Any
88

99
import pandas as pd
1010
from pydantic import BaseModel, ConfigDict
@@ -67,13 +67,14 @@ class CollectionSlice(BaseModel):
6767
# slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
6868
indices: Optional[Tuple[int, ...]] = None
6969
_collection: Optional[Collection] = None
70+
where: Any = None
7071

7172
@property
7273
def collection(self) -> Collection:
7374
if not self._collection and not self.indices:
7475
return self.base_collection
7576
if not self._collection:
76-
rows = self.base_collection.find({}, limit=-1).rows
77+
rows = self.base_collection.rows
7778
subset = [rows[i] for i in self.indices]
7879
db = self.base_collection.parent
7980
subset_name = self.slice_alias
@@ -94,6 +95,7 @@ def as_dataframe(self, flattened=False) -> pd.DataFrame:
9495
"""
9596
Return the slice of the collection as a dataframe.
9697
98+
:param flattened: flatten nested objects to give keys like foo.bar
9799
:return:
98100
"""
99101
rs = self.collection.find({}, limit=-1)

0 commit comments

Comments
 (0)