@@ -18,11 +18,18 @@ use super::{meta_page::MetaPage, neighbor_with_distance::NeighborWithDistance};
18
18
pub struct ListSearchNeighbor < PD > {
19
19
pub index_pointer : IndexPointer ,
20
20
distance : f32 ,
21
+ distance_tie_break : usize , /* only used if distance = 0. This ensures a consistent order of results when distance = 0 */
21
22
private_data : PD ,
22
23
}
23
24
24
25
impl < PD > PartialOrd for ListSearchNeighbor < PD > {
25
26
fn partial_cmp ( & self , other : & Self ) -> Option < Ordering > {
27
+ if self . distance == 0.0 && other. distance == 0.0 {
28
+ /* this logic should be consistent with what's used during pruning */
29
+ return self
30
+ . distance_tie_break
31
+ . partial_cmp ( & other. distance_tie_break ) ;
32
+ }
26
33
self . distance . partial_cmp ( & other. distance )
27
34
}
28
35
}
@@ -49,6 +56,7 @@ impl<PD> ListSearchNeighbor<PD> {
49
56
index_pointer,
50
57
private_data,
51
58
distance,
59
+ distance_tie_break : 0 ,
52
60
}
53
61
}
54
62
@@ -62,6 +70,7 @@ pub struct ListSearchResult<QDM, PD> {
62
70
visited : Vec < ListSearchNeighbor < PD > > ,
63
71
inserted : HashSet < ItemPointer > ,
64
72
pub sdm : Option < QDM > ,
73
+ tie_break_item_pointer : Option < ItemPointer > , /* This records the item pointer of the query. It's used for tie-breaking when the distance = 0 */
65
74
pub stats : GreedySearchStats ,
66
75
}
67
76
@@ -72,20 +81,23 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
72
81
visited : vec ! [ ] ,
73
82
inserted : HashSet :: new ( ) ,
74
83
sdm : None ,
84
+ tie_break_item_pointer : None ,
75
85
stats : GreedySearchStats :: new ( ) ,
76
86
}
77
87
}
78
88
79
89
fn new < S : Storage < QueryDistanceMeasure = QDM , LSNPrivateData = PD > > (
80
90
init_ids : Vec < ItemPointer > ,
81
91
sdm : S :: QueryDistanceMeasure ,
92
+ tie_break_item_pointer : Option < ItemPointer > ,
82
93
search_list_size : usize ,
83
94
meta_page : & MetaPage ,
84
95
gns : & GraphNeighborStore ,
85
96
storage : & S ,
86
97
) -> Self {
87
98
let neigbors = meta_page. get_num_neighbors ( ) as usize ;
88
99
let mut res = Self {
100
+ tie_break_item_pointer,
89
101
candidates : BinaryHeap :: with_capacity ( search_list_size * neigbors) ,
90
102
visited : Vec :: with_capacity ( search_list_size * 2 ) ,
91
103
//candidate_storage: Vec::with_capacity(search_list_size * neigbors),
@@ -107,8 +119,14 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
107
119
}
108
120
109
121
/// Internal function
110
- pub fn insert_neighbor ( & mut self , n : ListSearchNeighbor < PD > ) {
122
+ pub fn insert_neighbor ( & mut self , mut n : ListSearchNeighbor < PD > ) {
111
123
self . stats . record_candidate ( ) ;
124
+ if n. distance == 0.0 && self . tie_break_item_pointer . is_some ( ) {
125
+ /* record the tie break if distance is 0 */
126
+ let tie_break_item_pointer = self . tie_break_item_pointer . unwrap ( ) ;
127
+ let d = tie_break_item_pointer. ip_distance ( n. index_pointer ) ;
128
+ n. distance_tie_break = d;
129
+ }
112
130
self . candidates . push ( Reverse ( n) ) ;
113
131
}
114
132
@@ -213,7 +231,7 @@ impl<'a> Graph<'a> {
213
231
214
232
let ( pruned, new_neighbors) =
215
233
if candidates. len ( ) > self . neighbor_store . max_neighbors ( self . get_meta_page ( ) ) {
216
- let new_list = self . prune_neighbors ( candidates, storage, stats) ;
234
+ let new_list = self . prune_neighbors ( neighbors_of , candidates, storage, stats) ;
217
235
( true , new_list)
218
236
} else {
219
237
( false , candidates)
@@ -248,6 +266,7 @@ impl<'a> Graph<'a> {
248
266
/// the returned ListSearchResult elements. It shouldn't be used with self.greedy_search_iterate
249
267
fn greedy_search_for_build < S : Storage > (
250
268
& self ,
269
+ index_pointer : IndexPointer ,
251
270
query : PgVector ,
252
271
meta_page : & MetaPage ,
253
272
storage : & S ,
@@ -264,6 +283,7 @@ impl<'a> Graph<'a> {
264
283
let mut l = ListSearchResult :: new (
265
284
init_ids. unwrap ( ) ,
266
285
dm,
286
+ Some ( index_pointer) ,
267
287
search_list_size,
268
288
meta_page,
269
289
self . get_neighbor_store ( ) ,
@@ -293,6 +313,7 @@ impl<'a> Graph<'a> {
293
313
ListSearchResult :: new (
294
314
init_ids. unwrap ( ) ,
295
315
dm,
316
+ None ,
296
317
search_list_size,
297
318
& self . meta_page ,
298
319
self . get_neighbor_store ( ) ,
@@ -331,6 +352,7 @@ impl<'a> Graph<'a> {
331
352
/// if we save the factors or the distances and add incrementally. Not sure.
332
353
pub fn prune_neighbors < S : Storage > (
333
354
& self ,
355
+ neighbors_of : ItemPointer ,
334
356
mut candidates : Vec < NeighborWithDistance > ,
335
357
storage : & S ,
336
358
stats : & mut PruneNeighborStats ,
@@ -419,17 +441,43 @@ impl<'a> Graph<'a> {
419
441
debug_assert ! ( distance_between_candidate_and_existing_neighbor >= 0.0 ) ;
420
442
421
443
//factor is high if the candidate is closer to an existing neighbor than the point it's being considered for
422
- let factor =
423
- if distance_between_candidate_and_existing_neighbor < 0.0 + f32:: EPSILON {
424
- if distance_between_candidate_and_point < 0.0 + f32:: EPSILON {
425
- 1.0
444
+ let factor = if distance_between_candidate_and_existing_neighbor
445
+ < 0.0 + f32:: EPSILON
446
+ {
447
+ if distance_between_candidate_and_point < 0.0 + f32:: EPSILON {
448
+ /* Both distances are 0. This is a special and interesting case because the neighbors of all
449
+ other nodes will be the same on both nodes. This, in turn, means that we would have the same
450
+ neighbors on both. But, if the num_neighbors is small, we risk creating an unconnected subgraph all
451
+ pointing to each other (all nodes at the same point). For this reason, we create a consistent way
452
+ to rank the distance even at the same point (by using the item-pointer distance), and determine
453
+ the factor according to this new distance. This means that some 0-distance node will be pruned at alpha=1.0
454
+
455
+ Note: with sbq these equivalence relations are actually not uncommon */
456
+ let ip_distance_between_candidate_and_point = candidate_neighbor
457
+ . get_index_pointer_to_neighbor ( )
458
+ . ip_distance ( neighbors_of) ;
459
+
460
+ let ip_distance_between_candidate_and_existing_neighbor =
461
+ candidate_neighbor
462
+ . get_index_pointer_to_neighbor ( )
463
+ . ip_distance ( existing_neighbor. get_index_pointer_to_neighbor ( ) ) ;
464
+
465
+ if ip_distance_between_candidate_and_point
466
+ <= ip_distance_between_candidate_and_existing_neighbor
467
+ {
468
+ 0.99
426
469
} else {
427
- f64:: MAX
470
+ /* make sure this gets pruned for alpha=1.0 to make room for other nodes at higher distances */
471
+ 1.01
428
472
}
429
473
} else {
430
- distance_between_candidate_and_point as f64
431
- / distance_between_candidate_and_existing_neighbor as f64
432
- } ;
474
+ f64:: MAX
475
+ }
476
+ } else {
477
+ distance_between_candidate_and_point as f64
478
+ / distance_between_candidate_and_existing_neighbor as f64
479
+ } ;
480
+
433
481
max_factors[ j] = max_factors[ j] . max ( factor)
434
482
}
435
483
}
@@ -466,8 +514,13 @@ impl<'a> Graph<'a> {
466
514
let meta_page = self . get_meta_page ( ) ;
467
515
468
516
//TODO: make configurable?
469
- let v =
470
- self . greedy_search_for_build ( vec, meta_page, storage, & mut stats. greedy_search_stats ) ;
517
+ let v = self . greedy_search_for_build (
518
+ index_pointer,
519
+ vec,
520
+ meta_page,
521
+ storage,
522
+ & mut stats. greedy_search_stats ,
523
+ ) ;
471
524
472
525
let ( _, neighbor_list) = self . add_neighbors (
473
526
storage,
0 commit comments