Skip to content

Commit 367939b

Browse files
committed
Add node2vec
1 parent 023b8ce commit 367939b

File tree

5 files changed

+1077
-0
lines changed

5 files changed

+1077
-0
lines changed
Lines changed: 396 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,396 @@
1+
from __future__ import annotations
2+
3+
from abc import ABC, abstractmethod
4+
from typing import Any, List, Optional, Union
5+
6+
from pandas import DataFrame
7+
from pydantic import BaseModel, ConfigDict
8+
from pydantic.alias_generators import to_camel
9+
10+
from ...graph.graph_object import Graph
11+
from .estimation_result import EstimationResult
12+
13+
14+
class Node2VecEndpoints(ABC):
    """
    Abstract base class defining the API for the Node2Vec algorithm.

    Concrete subclasses must implement the four execution modes declared
    here: ``mutate``, ``stream``, ``write`` and ``estimate``. Every optional
    parameter defaults to ``None``.
    """

    @abstractmethod
    def mutate(
        self,
        G: Graph,
        mutate_property: str,
        iterations: Optional[int] = None,
        negative_sampling_rate: Optional[int] = None,
        positive_sampling_factor: Optional[float] = None,
        embedding_dimension: Optional[int] = None,
        embedding_initializer: Optional[Any] = None,
        initial_learning_rate: Optional[float] = None,
        min_learning_rate: Optional[float] = None,
        window_size: Optional[int] = None,
        negative_sampling_exponent: Optional[float] = None,
        relationship_types: Optional[List[str]] = None,
        node_labels: Optional[List[str]] = None,
        username: Optional[str] = None,
        log_progress: Optional[bool] = None,
        sudo: Optional[bool] = None,
        concurrency: Optional[Any] = None,
        job_id: Optional[Any] = None,
        walk_length: Optional[int] = None,
        walks_per_node: Optional[int] = None,
        in_out_factor: Optional[float] = None,
        return_factor: Optional[float] = None,
        walk_buffer_size: Optional[int] = None,
        relationship_weight_property: Optional[str] = None,
        random_seed: Optional[Any] = None,
    ) -> Node2VecMutateResult:
        """
        Executes the Node2Vec algorithm and writes the results back to the graph as a node property.

        Parameters
        ----------
        G : Graph
            The graph to run the algorithm on
        mutate_property : str
            The name of the node property to store the embeddings
        iterations : Optional[int], default=None
            The number of training iterations
        negative_sampling_rate : Optional[int], default=None
            Number of negative samples for each positive sample
        positive_sampling_factor : Optional[float], default=None
            Factor to multiply positive sampling weights
        embedding_dimension : Optional[int], default=None
            The dimension of the generated embeddings
        embedding_initializer : Optional[Any], default=None
            Strategy for initializing node embeddings
        initial_learning_rate : Optional[float], default=None
            The initial learning rate
        min_learning_rate : Optional[float], default=None
            The minimum learning rate
        window_size : Optional[int], default=None
            Size of the context window
        negative_sampling_exponent : Optional[float], default=None
            Exponent for negative sampling probability distribution
        relationship_types : Optional[List[str]], default=None
            The relationship types used to select relationships for this algorithm run
        node_labels : Optional[List[str]], default=None
            The node labels used to select nodes for this algorithm run
        username : Optional[str], default=None
            The username to attribute the procedure run to
        log_progress : Optional[bool], default=None
            Whether to log progress
        sudo : Optional[bool], default=None
            Override memory estimation limits
        concurrency : Optional[Any], default=None
            The number of concurrent threads
        job_id : Optional[Any], default=None
            An identifier for the job
        walk_length : Optional[int], default=None
            The length of each random walk
        walks_per_node : Optional[int], default=None
            Number of walks to sample for each node
        in_out_factor : Optional[float], default=None
            Tendency of the random walk to stay close to the start node or
            fan out in the graph; a higher value means stay local
        return_factor : Optional[float], default=None
            Tendency of the random walk to return to the last visited node;
            a value below 1.0 means a higher tendency
        walk_buffer_size : Optional[int], default=None
            Buffer size for walk sampling
        relationship_weight_property : Optional[str], default=None
            The name of the relationship property that contains the weights
        random_seed : Optional[Any], default=None
            Random seed for reproducible results

        Returns
        -------
        Node2VecMutateResult
            Algorithm metrics and statistics
        """

    @abstractmethod
    def stream(
        self,
        G: Graph,
        iterations: Optional[int] = None,
        negative_sampling_rate: Optional[int] = None,
        positive_sampling_factor: Optional[float] = None,
        embedding_dimension: Optional[int] = None,
        embedding_initializer: Optional[Any] = None,
        initial_learning_rate: Optional[float] = None,
        min_learning_rate: Optional[float] = None,
        window_size: Optional[int] = None,
        negative_sampling_exponent: Optional[float] = None,
        relationship_types: Optional[List[str]] = None,
        node_labels: Optional[List[str]] = None,
        username: Optional[str] = None,
        log_progress: Optional[bool] = None,
        sudo: Optional[bool] = None,
        concurrency: Optional[Any] = None,
        job_id: Optional[Any] = None,
        walk_length: Optional[int] = None,
        walks_per_node: Optional[int] = None,
        in_out_factor: Optional[float] = None,
        return_factor: Optional[float] = None,
        walk_buffer_size: Optional[int] = None,
        relationship_weight_property: Optional[str] = None,
        random_seed: Optional[Any] = None,
    ) -> DataFrame:
        """
        Executes the Node2Vec algorithm and returns the results as a stream.

        Parameters
        ----------
        G : Graph
            The graph to run the algorithm on
        iterations : Optional[int], default=None
            The number of training iterations
        negative_sampling_rate : Optional[int], default=None
            Number of negative samples for each positive sample
        positive_sampling_factor : Optional[float], default=None
            Factor to multiply positive sampling weights
        embedding_dimension : Optional[int], default=None
            The dimension of the generated embeddings
        embedding_initializer : Optional[Any], default=None
            Strategy for initializing node embeddings
        initial_learning_rate : Optional[float], default=None
            The initial learning rate
        min_learning_rate : Optional[float], default=None
            The minimum learning rate
        window_size : Optional[int], default=None
            Size of the context window
        negative_sampling_exponent : Optional[float], default=None
            Exponent for negative sampling probability distribution
        relationship_types : Optional[List[str]], default=None
            The relationship types used to select relationships for this algorithm run
        node_labels : Optional[List[str]], default=None
            The node labels used to select nodes for this algorithm run
        username : Optional[str], default=None
            The username to attribute the procedure run to
        log_progress : Optional[bool], default=None
            Whether to log progress
        sudo : Optional[bool], default=None
            Override memory estimation limits
        concurrency : Optional[Any], default=None
            The number of concurrent threads
        job_id : Optional[Any], default=None
            An identifier for the job
        walk_length : Optional[int], default=None
            The length of each random walk
        walks_per_node : Optional[int], default=None
            Number of walks to sample for each node
        in_out_factor : Optional[float], default=None
            Tendency of the random walk to stay close to the start node or
            fan out in the graph; a higher value means stay local
        return_factor : Optional[float], default=None
            Tendency of the random walk to return to the last visited node;
            a value below 1.0 means a higher tendency
        walk_buffer_size : Optional[int], default=None
            Buffer size for walk sampling
        relationship_weight_property : Optional[str], default=None
            The name of the relationship property that contains the weights
        random_seed : Optional[Any], default=None
            Random seed for reproducible results

        Returns
        -------
        DataFrame
            Embeddings as a stream with columns nodeId and embedding
        """

    @abstractmethod
    def write(
        self,
        G: Graph,
        write_property: str,
        iterations: Optional[int] = None,
        negative_sampling_rate: Optional[int] = None,
        positive_sampling_factor: Optional[float] = None,
        embedding_dimension: Optional[int] = None,
        embedding_initializer: Optional[Any] = None,
        initial_learning_rate: Optional[float] = None,
        min_learning_rate: Optional[float] = None,
        window_size: Optional[int] = None,
        negative_sampling_exponent: Optional[float] = None,
        relationship_types: Optional[List[str]] = None,
        node_labels: Optional[List[str]] = None,
        username: Optional[str] = None,
        log_progress: Optional[bool] = None,
        sudo: Optional[bool] = None,
        concurrency: Optional[Any] = None,
        job_id: Optional[Any] = None,
        walk_length: Optional[int] = None,
        walks_per_node: Optional[int] = None,
        in_out_factor: Optional[float] = None,
        return_factor: Optional[float] = None,
        walk_buffer_size: Optional[int] = None,
        relationship_weight_property: Optional[str] = None,
        random_seed: Optional[Any] = None,
        write_concurrency: Optional[Any] = None,
    ) -> Node2VecWriteResult:
        """
        Executes the Node2Vec algorithm and writes the results back to the database.

        Parameters
        ----------
        G : Graph
            The graph to run the algorithm on
        write_property : str
            The name of the node property to write the embeddings to
        iterations : Optional[int], default=None
            The number of training iterations
        negative_sampling_rate : Optional[int], default=None
            Number of negative samples for each positive sample
        positive_sampling_factor : Optional[float], default=None
            Factor to multiply positive sampling weights
        embedding_dimension : Optional[int], default=None
            The dimension of the generated embeddings
        embedding_initializer : Optional[Any], default=None
            Strategy for initializing node embeddings
        initial_learning_rate : Optional[float], default=None
            The initial learning rate
        min_learning_rate : Optional[float], default=None
            The minimum learning rate
        window_size : Optional[int], default=None
            Size of the context window
        negative_sampling_exponent : Optional[float], default=None
            Exponent for negative sampling probability distribution
        relationship_types : Optional[List[str]], default=None
            The relationship types used to select relationships for this algorithm run
        node_labels : Optional[List[str]], default=None
            The node labels used to select nodes for this algorithm run
        username : Optional[str], default=None
            The username to attribute the procedure run to
        log_progress : Optional[bool], default=None
            Whether to log progress
        sudo : Optional[bool], default=None
            Override memory estimation limits
        concurrency : Optional[Any], default=None
            The number of concurrent threads
        job_id : Optional[Any], default=None
            An identifier for the job
        walk_length : Optional[int], default=None
            The length of each random walk
        walks_per_node : Optional[int], default=None
            Number of walks to sample for each node
        in_out_factor : Optional[float], default=None
            Tendency of the random walk to stay close to the start node or
            fan out in the graph; a higher value means stay local
        return_factor : Optional[float], default=None
            Tendency of the random walk to return to the last visited node;
            a value below 1.0 means a higher tendency
        walk_buffer_size : Optional[int], default=None
            Buffer size for walk sampling
        relationship_weight_property : Optional[str], default=None
            The name of the relationship property that contains the weights
        random_seed : Optional[Any], default=None
            Random seed for reproducible results
        write_concurrency : Optional[Any], default=None
            The number of concurrent threads used for writing result

        Returns
        -------
        Node2VecWriteResult
            Algorithm metrics and statistics
        """

    @abstractmethod
    def estimate(
        self,
        G: Union[Graph, dict[str, Any]],
        iterations: Optional[int] = None,
        negative_sampling_rate: Optional[int] = None,
        positive_sampling_factor: Optional[float] = None,
        embedding_dimension: Optional[int] = None,
        embedding_initializer: Optional[Any] = None,
        initial_learning_rate: Optional[float] = None,
        min_learning_rate: Optional[float] = None,
        window_size: Optional[int] = None,
        negative_sampling_exponent: Optional[float] = None,
        relationship_types: Optional[List[str]] = None,
        node_labels: Optional[List[str]] = None,
        concurrency: Optional[Any] = None,
        walk_length: Optional[int] = None,
        walks_per_node: Optional[int] = None,
        in_out_factor: Optional[float] = None,
        return_factor: Optional[float] = None,
        walk_buffer_size: Optional[int] = None,
        relationship_weight_property: Optional[str] = None,
        random_seed: Optional[Any] = None,
    ) -> EstimationResult:
        """
        Returns an estimation of the memory consumption of running the Node2Vec algorithm.

        Parameters
        ----------
        G : Union[Graph, dict[str, Any]]
            The graph to run the algorithm on or a dictionary representing the graph.
        iterations : Optional[int], default=None
            The number of training iterations
        negative_sampling_rate : Optional[int], default=None
            Number of negative samples for each positive sample
        positive_sampling_factor : Optional[float], default=None
            Factor to multiply positive sampling weights
        embedding_dimension : Optional[int], default=None
            The dimension of the generated embeddings
        embedding_initializer : Optional[Any], default=None
            Strategy for initializing node embeddings
        initial_learning_rate : Optional[float], default=None
            The initial learning rate
        min_learning_rate : Optional[float], default=None
            The minimum learning rate
        window_size : Optional[int], default=None
            Size of the context window
        negative_sampling_exponent : Optional[float], default=None
            Exponent for negative sampling probability distribution
        relationship_types : Optional[List[str]], default=None
            The relationship types used to select relationships for this algorithm run
        node_labels : Optional[List[str]], default=None
            The node labels used to select nodes for this algorithm run
        concurrency : Optional[Any], default=None
            The number of concurrent threads
        walk_length : Optional[int], default=None
            The length of each random walk
        walks_per_node : Optional[int], default=None
            Number of walks to sample for each node
        in_out_factor : Optional[float], default=None
            Tendency of the random walk to stay close to the start node or
            fan out in the graph; a higher value means stay local
        return_factor : Optional[float], default=None
            Tendency of the random walk to return to the last visited node;
            a value below 1.0 means a higher tendency
        walk_buffer_size : Optional[int], default=None
            Buffer size for walk sampling
        relationship_weight_property : Optional[str], default=None
            The name of the relationship property that contains the weights
        random_seed : Optional[Any], default=None
            Random seed for reproducible results

        Returns
        -------
        EstimationResult
            Memory estimation details
        """
367+
368+
369+
class Node2VecMutateResult(BaseModel):
    """
    Result model returned by the ``mutate`` mode of Node2Vec.

    Fields are declared in snake_case; the configured alias generator
    derives a camelCase alias for each of them (``node_count`` ->
    ``nodeCount``).
    """

    # Derive camelCase aliases from the snake_case field names below.
    model_config = ConfigDict(alias_generator=to_camel)

    # Counters
    node_count: int
    node_properties_written: int

    # Timings, in milliseconds
    pre_processing_millis: int
    compute_millis: int
    mutate_millis: int

    configuration: dict[str, Any]
    loss_per_iteration: List[float]

    def __getitem__(self, item: str) -> Any:
        """
        Return the value stored under the attribute name ``item``.

        The lookup goes through the instance dictionary, so ``item`` must be
        the Python (snake_case) attribute name; an unknown name raises
        ``KeyError``.
        """
        return vars(self)[item]
382+
383+
384+
class Node2VecWriteResult(BaseModel):
    """
    Result model returned by the ``write`` mode of Node2Vec.

    Fields are declared in snake_case; the configured alias generator
    derives a camelCase alias for each of them (``write_millis`` ->
    ``writeMillis``).
    """

    # Derive camelCase aliases from the snake_case field names below.
    model_config = ConfigDict(alias_generator=to_camel)

    # Counters
    node_count: int
    node_properties_written: int

    # Timings, in milliseconds
    pre_processing_millis: int
    compute_millis: int
    write_millis: int

    configuration: dict[str, Any]
    loss_per_iteration: List[float]

    def __getitem__(self, item: str) -> Any:
        """
        Return the value stored under the attribute name ``item``.

        The lookup goes through the instance dictionary, so ``item`` must be
        the Python (snake_case) attribute name; an unknown name raises
        ``KeyError``.
        """
        return vars(self)[item]

0 commit comments

Comments
 (0)