Commit 985cb0a

Fix handling of many graph nodes at the same location
When nodes share the same location, their distance to each other is 0 and their distances to all other nodes are identical. With a small maximum neighbor count this causes problems: the co-located nodes end up linking only to each other and get cut off from the rest of the graph. We fix this by introducing a secondary "tie break" distance between nodes at the same location, based on their on-disk distance. It provides a consistent secondary ordering for equivalent nodes and lets the prune function cut away equivalent nodes so that neighbor lists don't fill up with them. Note that with SBQ, such equivalence classes are actually not so rare, especially with low dimension counts.
1 parent 4f46878 commit 985cb0a
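
In outline, the fix makes candidate ordering fall back to a deterministic secondary key whenever two candidates are both at distance 0 from the query. A minimal standalone sketch of that idea (the struct and names here are illustrative only, not the committed types; the real implementation is in graph.rs below):

use std::cmp::Ordering;

/// Illustrative stand-in for a search candidate: `distance` is the vector
/// distance to the query, `tie_break` is a deterministic key derived from the
/// candidate's on-disk position.
struct Candidate {
    distance: f32,
    tie_break: usize,
}

/// Order by distance, but when both distances are exactly 0 (co-located nodes),
/// fall back to the on-disk tie-break so equivalent nodes still have a
/// consistent ordering.
fn cmp_candidates(a: &Candidate, b: &Candidate) -> Ordering {
    if a.distance == 0.0 && b.distance == 0.0 {
        return a.tie_break.cmp(&b.tie_break);
    }
    a.distance
        .partial_cmp(&b.distance)
        .unwrap_or(Ordering::Equal)
}

fn main() {
    let x = Candidate { distance: 0.0, tie_break: 3 };
    let y = Candidate { distance: 0.0, tie_break: 7 };
    // Without the tie-break these would compare as equal; with it, x sorts first.
    assert_eq!(cmp_candidates(&x, &y), Ordering::Less);
}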

3 files changed (+170 -12 lines changed)


pgvectorscale/src/access_method/build.rs

Lines changed: 98 additions & 0 deletions
@@ -284,6 +284,7 @@ fn finalize_index_build<S: Storage>(
         if neighbors.len() > state.graph.get_meta_page().get_num_neighbors() as _ {
             //OPT: get rid of this clone
             prune_neighbors = state.graph.prune_neighbors(
+                index_pointer,
                 neighbors.clone(),
                 storage,
                 &mut write_stats.prune_stats,
@@ -835,4 +836,101 @@ pub mod tests {
 
         Ok(())
     }
+
+    #[pg_test]
+    pub unsafe fn test_index_small_accuracy() -> spi::Result<()> {
+        /* Test for the creation of connected graphs when the number of dimensions is small, as is
+        the number of neighbors. */
+        /* A small num_neighbors is especially challenging for making sure no nodes get disconnected. */
+        let index_options = "num_neighbors=10, search_list_size=10";
+        let expected_cnt = 300;
+        let dimensions = 2;
+
+        Spi::run(&format!(
+            "CREATE TABLE test_data (
+                 id int,
+                 embedding vector ({dimensions})
+             );
+
+             select setseed(0.5);
+             -- generate 300 vectors
+             INSERT INTO test_data (id, embedding)
+             SELECT
+                *
+             FROM (
+                 SELECT
+                    i % {expected_cnt},
+                    ('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
+                 FROM
+                    generate_series(1, {dimensions} * {expected_cnt}) i
+                 GROUP BY
+                    i % {expected_cnt}) g;
+
+             CREATE INDEX idx_diskann_bq ON test_data USING diskann (embedding) WITH ({index_options});
+
+
+             SET enable_seqscan = 0;
+             -- perform index scans on the vectors
+             SELECT
+                *
+             FROM
+                test_data
+             ORDER BY
+                embedding <=> (
+                  SELECT
+                     ('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
+                  FROM generate_series(1, {dimensions}));"))?;
+
+        let test_vec: Option<Vec<f32>> = Spi::get_one(&format!(
+            "SELECT('{{' || array_to_string(array_agg(1.0), ',', '0') || '}}')::real[] AS embedding
+            FROM generate_series(1, {dimensions})"
+        ))?;
+
+        let cnt: Option<i64> = Spi::get_one_with_args(
+            &format!(
+                "
+                SET enable_seqscan = 0;
+                SET enable_indexscan = 1;
+                SET diskann.query_search_list_size = 2;
+                WITH cte as (select * from test_data order by embedding <=> $1::vector) SELECT count(*) from cte;
+                ",
+            ),
+            vec![(
+                pgrx::PgOid::Custom(pgrx::pg_sys::FLOAT4ARRAYOID),
+                test_vec.clone().into_datum(),
+            )],
+        )?;
+
+        if cnt.unwrap() != expected_cnt {
+            /* better debugging */
+            let id: Option<i64> = Spi::get_one_with_args(
+                &format!(
+                    "
+                    SET enable_seqscan = 0;
+                    SET enable_indexscan = 1;
+                    SET diskann.query_search_list_size = 2;
+                    WITH cte as (select id from test_data EXCEPT (select id from test_data order by embedding <=> $1::vector)) SELECT id from cte limit 1;
+                    ",
+                ),
+                vec![(
+                    pgrx::PgOid::Custom(pgrx::pg_sys::FLOAT4ARRAYOID),
+                    test_vec.clone().into_datum(),
+                )],
+            )?;
+
+            assert!(
+                cnt.unwrap() == expected_cnt,
+                "initial count is {} id is {}",
+                cnt.unwrap(),
+                id.unwrap()
+            );
+        }
+
+        assert!(
+            cnt.unwrap() == expected_cnt,
+            "initial count is {}",
+            cnt.unwrap()
+        );
+        Ok(())
+    }
 }

pgvectorscale/src/access_method/graph.rs

Lines changed: 65 additions & 12 deletions
@@ -18,11 +18,18 @@ use super::{meta_page::MetaPage, neighbor_with_distance::NeighborWithDistance};
 pub struct ListSearchNeighbor<PD> {
     pub index_pointer: IndexPointer,
     distance: f32,
+    distance_tie_break: usize, /* only used if distance = 0. This ensures a consistent order of results when distance = 0 */
     private_data: PD,
 }
 
 impl<PD> PartialOrd for ListSearchNeighbor<PD> {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        if self.distance == 0.0 && other.distance == 0.0 {
+            /* this logic should be consistent with what's used during pruning */
+            return self
+                .distance_tie_break
+                .partial_cmp(&other.distance_tie_break);
+        }
         self.distance.partial_cmp(&other.distance)
     }
 }
@@ -49,6 +56,7 @@ impl<PD> ListSearchNeighbor<PD> {
             index_pointer,
             private_data,
             distance,
+            distance_tie_break: 0,
         }
     }
 
@@ -62,6 +70,7 @@ pub struct ListSearchResult<QDM, PD> {
     visited: Vec<ListSearchNeighbor<PD>>,
     inserted: HashSet<ItemPointer>,
     pub sdm: Option<QDM>,
+    tie_break_item_pointer: Option<ItemPointer>, /* This records the item pointer of the query. It's used for tie-breaking when the distance = 0 */
     pub stats: GreedySearchStats,
 }
 
@@ -72,20 +81,23 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
             visited: vec![],
             inserted: HashSet::new(),
             sdm: None,
+            tie_break_item_pointer: None,
             stats: GreedySearchStats::new(),
         }
     }
 
     fn new<S: Storage<QueryDistanceMeasure = QDM, LSNPrivateData = PD>>(
         init_ids: Vec<ItemPointer>,
         sdm: S::QueryDistanceMeasure,
+        tie_break_item_pointer: Option<ItemPointer>,
         search_list_size: usize,
         meta_page: &MetaPage,
         gns: &GraphNeighborStore,
         storage: &S,
     ) -> Self {
         let neigbors = meta_page.get_num_neighbors() as usize;
         let mut res = Self {
+            tie_break_item_pointer,
             candidates: BinaryHeap::with_capacity(search_list_size * neigbors),
             visited: Vec::with_capacity(search_list_size * 2),
             //candidate_storage: Vec::with_capacity(search_list_size * neigbors),
@@ -107,8 +119,14 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
     }
 
     /// Internal function
-    pub fn insert_neighbor(&mut self, n: ListSearchNeighbor<PD>) {
+    pub fn insert_neighbor(&mut self, mut n: ListSearchNeighbor<PD>) {
         self.stats.record_candidate();
+        if n.distance == 0.0 && self.tie_break_item_pointer.is_some() {
+            /* record the tie break if distance is 0 */
+            let tie_break_item_pointer = self.tie_break_item_pointer.unwrap();
+            let d = tie_break_item_pointer.ip_distance(n.index_pointer);
+            n.distance_tie_break = d;
+        }
         self.candidates.push(Reverse(n));
     }
 
@@ -213,7 +231,7 @@ impl<'a> Graph<'a> {
 
         let (pruned, new_neighbors) =
             if candidates.len() > self.neighbor_store.max_neighbors(self.get_meta_page()) {
-                let new_list = self.prune_neighbors(candidates, storage, stats);
+                let new_list = self.prune_neighbors(neighbors_of, candidates, storage, stats);
                 (true, new_list)
             } else {
                 (false, candidates)
@@ -248,6 +266,7 @@ impl<'a> Graph<'a> {
     /// the returned ListSearchResult elements. It shouldn't be used with self.greedy_search_iterate
     fn greedy_search_for_build<S: Storage>(
         &self,
+        index_pointer: IndexPointer,
         query: PgVector,
         meta_page: &MetaPage,
         storage: &S,
@@ -264,6 +283,7 @@ impl<'a> Graph<'a> {
         let mut l = ListSearchResult::new(
             init_ids.unwrap(),
             dm,
+            Some(index_pointer),
             search_list_size,
             meta_page,
             self.get_neighbor_store(),
@@ -293,6 +313,7 @@ impl<'a> Graph<'a> {
         ListSearchResult::new(
             init_ids.unwrap(),
             dm,
+            None,
             search_list_size,
             &self.meta_page,
             self.get_neighbor_store(),
@@ -331,6 +352,7 @@ impl<'a> Graph<'a> {
     /// if we save the factors or the distances and add incrementally. Not sure.
     pub fn prune_neighbors<S: Storage>(
         &self,
+        neighbors_of: ItemPointer,
         mut candidates: Vec<NeighborWithDistance>,
        storage: &S,
        stats: &mut PruneNeighborStats,
@@ -419,17 +441,43 @@ impl<'a> Graph<'a> {
                     debug_assert!(distance_between_candidate_and_existing_neighbor >= 0.0);
 
                     //factor is high if the candidate is closer to an existing neighbor than the point it's being considered for
-                    let factor =
-                        if distance_between_candidate_and_existing_neighbor < 0.0 + f32::EPSILON {
-                            if distance_between_candidate_and_point < 0.0 + f32::EPSILON {
-                                1.0
+                    let factor = if distance_between_candidate_and_existing_neighbor
+                        < 0.0 + f32::EPSILON
+                    {
+                        if distance_between_candidate_and_point < 0.0 + f32::EPSILON {
+                            /* Both distances are 0. This is a special and interesting case because the neighbors of all
+                            other nodes will be the same on both nodes. This, in turn, means that we would have the same
+                            neighbors on both. But, if num_neighbors is small, we risk creating an unconnected subgraph all
+                            pointing to each other (all nodes at the same point). For this reason, we create a consistent way
+                            to rank the distance even at the same point (by using the item-pointer distance), and determine
+                            the factor according to this new distance. This means that some 0-distance nodes will be pruned at alpha=1.0.
+
+                            Note: with SBQ these equivalence relations are actually not uncommon */
+                            let ip_distance_between_candidate_and_point = candidate_neighbor
+                                .get_index_pointer_to_neighbor()
+                                .ip_distance(neighbors_of);
+
+                            let ip_distance_between_candidate_and_existing_neighbor =
+                                candidate_neighbor
+                                    .get_index_pointer_to_neighbor()
+                                    .ip_distance(existing_neighbor.get_index_pointer_to_neighbor());
+
+                            if ip_distance_between_candidate_and_point
+                                <= ip_distance_between_candidate_and_existing_neighbor
+                            {
+                                0.99
                             } else {
-                                f64::MAX
+                                /* make sure this gets pruned for alpha=1.0 to make room for other nodes at higher distances */
+                                1.01
                             }
                         } else {
-                            distance_between_candidate_and_point as f64
-                                / distance_between_candidate_and_existing_neighbor as f64
-                        };
+                            f64::MAX
+                        }
+                    } else {
+                        distance_between_candidate_and_point as f64
+                            / distance_between_candidate_and_existing_neighbor as f64
+                    };
+
                     max_factors[j] = max_factors[j].max(factor)
                 }
             }
@@ -466,8 +514,13 @@ impl<'a> Graph<'a> {
         let meta_page = self.get_meta_page();
 
         //TODO: make configurable?
-        let v =
-            self.greedy_search_for_build(vec, meta_page, storage, &mut stats.greedy_search_stats);
+        let v = self.greedy_search_for_build(
+            index_pointer,
+            vec,
+            meta_page,
+            storage,
+            &mut stats.greedy_search_stats,
+        );
 
         let (_, neighbor_list) = self.add_neighbors(
             storage,
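
For clarity, the co-located branch of the prune factor in the hunk above can be restated on its own. This is a hedged paraphrase with illustrative parameter names, not the committed code (which operates on NeighborWithDistance and ItemPointer values); it only shows which side of the initial alpha = 1.0 threshold each case lands on:

/// Factor rule for co-located nodes (both vector distances are 0): rank by
/// item-pointer distance instead. Returns a factor just below 1.0 (survives
/// pruning at alpha = 1.0) when the candidate is ip-closer to the point being
/// pruned than to the existing neighbor, and just above 1.0 (cut at alpha = 1.0)
/// otherwise.
fn zero_distance_factor(
    ip_distance_candidate_to_point: usize,
    ip_distance_candidate_to_existing_neighbor: usize,
) -> f64 {
    if ip_distance_candidate_to_point <= ip_distance_candidate_to_existing_neighbor {
        0.99
    } else {
        1.01
    }
}

fn main() {
    // Candidate is "closer" on disk to the point than to the existing neighbor: kept.
    assert_eq!(zero_distance_factor(2, 10), 0.99);
    // Otherwise it is cut at alpha = 1.0, freeing a slot for a farther, useful neighbor.
    assert_eq!(zero_distance_factor(10, 2), 1.01);
}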

pgvectorscale/src/util/mod.rs

Lines changed: 7 additions & 0 deletions
@@ -121,6 +121,13 @@ impl ItemPointer {
             len: len as _,
         }
     }
+
+    pub fn ip_distance(self, other: Self) -> usize {
+        let block_diff = (self.block_number as isize - other.block_number as isize).abs() as usize;
+        let offset_diff = (self.offset as isize - other.offset as isize).abs() as usize;
+        assert!(offset_diff < pgrx::pg_sys::MaxOffsetNumber as _);
+        (block_diff * (pgrx::pg_sys::MaxOffsetNumber as usize) + offset_diff) as usize
+    }
 }
 
 pub type IndexPointer = ItemPointer;
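
The tie-break key itself comes from ip_distance above: it flattens a (block number, offset) pair into a single scalar, so any two distinct on-disk positions compare differently. A rough standalone illustration of that mapping, with the constant as a stand-in for pgrx::pg_sys::MaxOffsetNumber and tuples standing in for ItemPointer:

// Stand-in for pgrx::pg_sys::MaxOffsetNumber; the exact value only needs to
// exceed any real offset difference for the illustration to hold.
const MAX_OFFSET_NUMBER: usize = 2048;

/// Same shape as the committed ip_distance: |block delta| scaled by the maximum
/// offset number, plus |offset delta|.
fn ip_distance(a: (u32, u16), b: (u32, u16)) -> usize {
    let block_diff = (a.0 as isize - b.0 as isize).abs() as usize;
    let offset_diff = (a.1 as isize - b.1 as isize).abs() as usize;
    assert!(offset_diff < MAX_OFFSET_NUMBER);
    block_diff * MAX_OFFSET_NUMBER + offset_diff
}

fn main() {
    let query: (u32, u16) = (10, 3); // (block_number, offset) of the node being inserted
    let close: (u32, u16) = (10, 5); // same block, two slots away
    let far: (u32, u16) = (12, 3); // two blocks away
    // Nodes stored nearby on disk get a smaller tie-break value.
    assert!(ip_distance(query, close) < ip_distance(query, far));
}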
