Skip to content

Commit 0872233

Browse files
authored
Merge pull request #15 from linkml/pxf-nb
Documentation
2 parents c2e8617 + 3e6929e commit 0872233

File tree

15 files changed

+1487
-116
lines changed

15 files changed

+1487
-116
lines changed

docs/about.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@ About
66
LinkML-Store is an early effort to provide a unifying storage layer
77
over multiple different backends, unified via LinkML schemas.
88

9+
The overall goals are to provide:
10+
11+
* Easier handling of data in different forms (tabular, JSON, columnar, RDF)
12+
* Expressive validation at scale, including full referential integrity validation
13+
* Ability to mix and match different backends (e.g. DuckDB, MongoDB, Solr, ChromaDB, HDF5)
14+
* Composability of different search indexes, including LLM textual embeddings
15+
* LAMP-like stack for LinkML
16+
917
Installation
1018
------------
1119

docs/how-to/Index-caDSR.ipynb

Lines changed: 1026 additions & 0 deletions
Large diffs are not rendered by default.

docs/how-to/Use-MongoDB.ipynb

Lines changed: 113 additions & 62 deletions
Large diffs are not rendered by default.

src/linkml_store/api/client.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,27 @@ class Client:
2727
"""
2828
A client is the top-level object for interacting with databases.
2929
30-
A client has access to one or more :class:`Database` objects.
30+
* A client has access to one or more :class:`.Database` objects.
31+
* Each database consists of a number of :class:`.Collection` objects.
3132
32-
Each database consists of a number of :class:`.Collection` objects.
33-
34-
Examples
35-
--------
33+
Creating a client
34+
-----------------
3635
>>> client = Client()
36+
37+
Attaching a database
38+
--------------------
3739
>>> db = client.attach_database("duckdb", alias="test")
40+
41+
Note that normally a handle would be specified by a locator such as ``duckdb:///<PATH>``, but
42+
for convenience, an in-memory duckdb object can be specified without a full locator.
43+
44+
We can check the actual handle:
45+
46+
>>> db.handle
47+
'duckdb:///:memory:'
48+
49+
Creating a new collection
50+
-------------------------
3851
>>> collection = db.create_collection("Person")
3952
>>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
4053
>>> collection.insert(objs)
@@ -171,6 +184,11 @@ def attach_database(
171184
self._databases = {}
172185
self._databases[alias] = db
173186
db.parent = self
187+
if db.alias:
188+
if db.alias != alias:
189+
raise AssertionError(f"Inconsistent alias: {db.alias} != {alias}")
190+
else:
191+
db.metadata.alias = alias
174192
return db
175193

176194
def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:

src/linkml_store/api/collection.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from pydantic import BaseModel
1313

1414
from linkml_store.index import get_indexer
15+
from linkml_store.utils.format_utils import load_objects
1516
from linkml_store.utils.object_utils import clean_empties
1617

1718
try:
@@ -69,8 +70,12 @@ def __init__(
6970
self.metadata = metadata
7071
else:
7172
self.metadata = CollectionConfig(name=name, **kwargs)
72-
if name is not None and self.metadata.name is not None and name != self.metadata.name:
73-
raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
73+
if not self.metadata.alias:
74+
self.metadata.alias = name
75+
if not self.metadata.type:
76+
self.metadata.type = name
77+
# if name is not None and self.metadata.name is not None and name != self.metadata.name:
78+
# raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
7479

7580
@property
7681
def name(self) -> str:
@@ -93,7 +98,7 @@ def hidden(self) -> bool:
9398
9499
:return: True if the collection is hidden
95100
"""
96-
return self.metadata.hidden
101+
# return self.metadata.hidden
97102

98103
@property
99104
def target_class_name(self):
@@ -152,6 +157,7 @@ def alias(self):
152157
:return:
153158
"""
154159
# TODO: this is a shim layer until we can normalize on this
160+
# TODO: this is a shim layer until we can normalize on this
155161
if self.metadata.alias:
156162
return self.metadata.alias
157163
return self.name
@@ -444,9 +450,13 @@ def is_internal(self) -> bool:
444450
445451
:return:
446452
"""
447-
if not self.name:
448-
raise ValueError(f"Collection has no name: {self} // {self.metadata}")
449-
return self.name.startswith("internal__")
453+
if not self.alias:
454+
raise ValueError(f"Collection has no alias: {self} // {self.metadata}")
455+
return self.alias.startswith("internal__")
456+
457+
def load_from_source(self):
458+
objects = load_objects(self.metadata.source_location)
459+
self.insert(objects)
450460

451461
def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
452462
"""
@@ -599,6 +609,8 @@ def induce_class_definition_from_objects(self, objs: List[OBJECT], max_sample_si
599609
:param max_sample_size:
600610
:return:
601611
"""
612+
if not self.target_class_name:
613+
raise ValueError(f"No target_class_name for {self.alias}")
602614
cd = ClassDefinition(self.target_class_name)
603615
keys = defaultdict(list)
604616
for obj in objs[0:max_sample_size]:

src/linkml_store/api/config.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class CollectionConfig(BaseModel):
1616
default=None,
1717
description="The type of object in the collection. TODO; use this instead of name",
1818
)
19-
metadata: Optional[Dict] = Field(
19+
additional_properties: Optional[Dict] = Field(
2020
default=None,
2121
description="Optional metadata for the collection",
2222
)
@@ -36,6 +36,10 @@ class CollectionConfig(BaseModel):
3636
default=False,
3737
description="Whether the collection is prepopulated",
3838
)
39+
source_location: Optional[str] = Field(
40+
default=None,
41+
description="Filesystem or remote URL that stores the data",
42+
)
3943

4044

4145
class DatabaseConfig(BaseModel):
@@ -55,7 +59,7 @@ class DatabaseConfig(BaseModel):
5559
default=None,
5660
description="The LinkML schema as a dictionary",
5761
)
58-
collections: Dict[str, CollectionConfig] = Field(
62+
collections: Optional[Dict[str, CollectionConfig]] = Field(
5963
default={},
6064
description="A dictionary of collection configurations",
6165
)

src/linkml_store/api/database.py

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,33 @@ class Database(ABC):
2929
"""
3030
A Database provides access to named collections of data.
3131
32-
Examples
33-
--------
32+
A database object is owned by a :ref:`Client`. The database
33+
object uses a :ref:`handle` to know what kind of external
34+
database system to connect to (e.g. duckdb, mongodb). The handle
35+
is a string ``<DatabaseType>:<LocalLocator>``
36+
37+
The
38+
database object may also have an :ref:`alias` that is mapped
39+
to the handle.
40+
41+
Attaching a database
42+
--------------------
3443
>>> from linkml_store.api.client import Client
3544
>>> client = Client()
36-
>>> db = client.attach_database("duckdb", alias="test")
45+
>>> db = client.attach_database("duckdb:///:memory:", alias="test")
46+
47+
We can check the value of the handle:
48+
3749
>>> db.handle
3850
'duckdb:///:memory:'
51+
52+
The alias can be used to retrieve the database object from the client:
53+
54+
>>> assert db == client.get_database("test")
55+
56+
Creating a collection
57+
---------------------
58+
3959
>>> collection = db.create_collection("Person")
4060
>>> len(db.list_collections())
4161
1
@@ -108,6 +128,8 @@ def from_config(self, db_config: DatabaseConfig, **kwargs):
108128
return self
109129

110130
def _initialize_collections(self):
131+
if not self.metadata.collections:
132+
return
111133
for name, collection_config in self.metadata.collections.items():
112134
alias = collection_config.alias
113135
typ = collection_config.type
@@ -156,6 +178,10 @@ def handle(self) -> str:
156178
"""
157179
return self.metadata.handle
158180

181+
@property
182+
def alias(self):
183+
return self.metadata.alias
184+
159185
def store(self, obj: Dict[str, Any], **kwargs):
160186
"""
161187
Store an object in the database.
@@ -193,9 +219,11 @@ def store(self, obj: Dict[str, Any], **kwargs):
193219
if not v:
194220
continue
195221
if slot:
196-
collection = self.get_collection(slot.range, create_if_not_exists=True)
222+
logger.debug(f"Aligning to existing slot: {slot.name} range={slot.range}")
223+
collection = self.get_collection(slot.name, type=slot.range, create_if_not_exists=True)
197224
else:
198225
collection = self.get_collection(k, create_if_not_exists=True)
226+
logger.debug(f"Replacing using {collection.alias} {collection.target_class_name}")
199227
collection.replace(v)
200228

201229
def commit(self, **kwargs):
@@ -260,6 +288,8 @@ def create_collection(
260288
raise ValueError(f"Collection name must be provided: alias: {alias} metadata: {metadata}")
261289
collection_cls = self.collection_class
262290
collection = collection_cls(name=name, alias=alias, parent=self, metadata=metadata)
291+
if metadata and metadata.source_location:
292+
collection.load_from_source()
263293
if metadata and metadata.attributes:
264294
sv = self.schema_view
265295
schema = sv.schema
@@ -318,7 +348,9 @@ def list_collection_names(self, **kwargs) -> Sequence[str]:
318348
"""
319349
return [c.name for c in self.list_collections(**kwargs)]
320350

321-
def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection":
351+
def get_collection(
352+
self, name: str, type: Optional[str] = None, create_if_not_exists=True, **kwargs
353+
) -> "Collection":
322354
"""
323355
Get a named collection.
324356
@@ -336,14 +368,19 @@ def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Col
336368
KeyError: 'Collection NonExistent does not exist'
337369
338370
:param name: name of the collection
371+
:param type: target class name
339372
:param create_if_not_exists: create the collection if it does not exist
340373
341374
"""
342375
if not self._collections:
376+
logger.debug("Initializing collections")
343377
self.init_collections()
344378
if name not in self._collections.keys():
345379
if create_if_not_exists:
346-
self._collections[name] = self.create_collection(name)
380+
if type is None:
381+
type = name
382+
logger.debug(f"Creating new collection: {name} kwargs: {kwargs}")
383+
self._collections[name] = self.create_collection(type, alias=name, **kwargs)
347384
else:
348385
raise KeyError(f"Collection {name} does not exist")
349386
return self._collections[name]
@@ -470,8 +507,7 @@ def set_schema_view(self, schema_view: Union[str, Path, SchemaView]):
470507
if inlined and slot.range:
471508
if slot.name in self._collections:
472509
coll = self._collections[slot.name]
473-
if not coll.metadata.type:
474-
coll.metadata.type = slot.range
510+
coll.metadata.type = slot.range
475511

476512
def load_schema_view(self, path: Union[str, Path]):
477513
"""
@@ -538,7 +574,7 @@ def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
538574
>>> db = client.attach_database("duckdb", alias="test")
539575
>>> db.load_schema_view("tests/input/countries/countries.linkml.yaml")
540576
541-
Let's introspet the schema to see what slots are applicable for the class "Country":
577+
Let's introspect the schema to see what slots are applicable for the class "Country":
542578
543579
>>> sv = db.schema_view
544580
>>> for slot in sv.class_induced_slots("Country"):

src/linkml_store/api/stores/duckdb/duckdb_collection.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@ class DuckDBCollection(Collection):
1919
_table_created: bool = None
2020

2121
def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
22+
logger.debug(f"Inserting {len(objs)}")
2223
if not isinstance(objs, list):
2324
objs = [objs]
2425
if not objs:
2526
return
2627
cd = self.class_definition()
2728
if not cd:
29+
logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}; will induce")
2830
cd = self.induce_class_definition_from_objects(objs)
2931
self._create_table(cd)
3032
table = self._sqla_table(cd)

src/linkml_store/api/stores/duckdb/duckdb_database.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,10 @@ def query(self, query: Query, **kwargs) -> QueryResult:
116116

117117
def init_collections(self):
118118
# TODO: unify schema introspection
119-
schema = introspect_schema(self.engine)
119+
if not self.schema_view:
120+
schema = introspect_schema(self.engine)
121+
else:
122+
schema = self.schema_view.schema
120123
table_names = schema.classes.keys()
121124
if self._collections is None:
122125
self._collections = {}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""
2+
Adapter for DuckDB embedded database.
3+
4+
Handles have the form:
5+
6+
- ``duckdb:///<path>`` for a file-based database
7+
- ``duckdb:///:memory:`` for an in-memory database
8+
"""
9+
10+
from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
11+
from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
12+
13+
__all__ = [
14+
"DuckDBCollection",
15+
"DuckDBDatabase",
16+
]

0 commit comments

Comments
 (0)