Merge pull request #11 from Mye-InfoBank/per-dataset-resolution

nictru · web-flow · commit 83f4335b86ac · 2025-08-26T19:06:30.000+02:00
Implement per-dataset resolution
diff --git a/README.md b/README.md
@@ -112,7 +112,18 @@ This includes only two steps:
 
 ### Step 3: Find unification opportunities
 
-Currently, there is only one approach implemented.
+Currently, there are two approaches implemented, applied in sequence.
+
+#### Resolve per dataset
+
+For each unapproved symbol, this step examines every dataset that contains it and checks whether that dataset alone determines an unambiguous approved replacement. The logic works as follows:
+
+1. For each unapproved symbol, examine each dataset that contains it
+2. Look at all the approved symbols that the unapproved symbol connects to (its neighbors in the graph)
+3. Check which of these approved neighbors are **not** present in the current dataset
+4. If exactly one approved neighbor is missing from the dataset, rename the unapproved symbol to that missing approved symbol in this specific dataset
+
+This approach handles cases where an unapproved symbol has multiple potential approved targets, but the dataset context makes the choice clear. For example, if an unapproved symbol connects to three approved symbols, but two of them are already present in a particular dataset, then the third one is the obvious choice for that dataset.
 
 #### Resolve unapproved symbols
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hugo-unifier"
-version = "0.2.8"
+version = "0.3.0"
 description = "Python package that can unify gene symbols across datasets based on the HUGO database."
 readme = "README.md"
 authors = [
diff --git a/src/hugo_unifier/get_changes.py b/src/hugo_unifier/get_changes.py
@@ -9,6 +9,7 @@
     remove_self_edges,
     remove_loose_ends,
     resolve_unapproved,
+    resolve_per_dataset,
 )
 
 
@@ -54,6 +55,7 @@ def get_changes(
     remove_loose_ends(G)
 
     graph_manipulations: List[Callable[[nx.DiGraph, pd.DataFrame]]] = [
+        resolve_per_dataset,
         resolve_unapproved,
         # aggregate_approved,
     ]
diff --git a/src/hugo_unifier/graph_manipulations.py b/src/hugo_unifier/graph_manipulations.py
@@ -55,6 +55,37 @@ def __decide_successor__(G: nx.DiGraph, node: str, df: pd.DataFrame) -> str:
     return None
 
 
+def resolve_per_dataset(G: nx.DiGraph, df: pd.DataFrame) -> pd.DataFrame:
+    # For every node whose "type" is not "approvedSymbol", inspect each sample listed on that node.
+    # If exactly one graph neighbor of the node does not list the sample, append a "rename" row to df
+    # and move the sample from the node's "samples" to that neighbor's. Returns the same df object.
+
+    for node in list(G.nodes()):
+        if G.nodes[node]["type"] == "approvedSymbol":
+            continue
+
+        samples = list(G.nodes[node]["samples"])  # snapshot: entries are removed from the original inside the loop
+        for sample in samples:
+            non_used_neighbors = []
+            for neighbor in G.neighbors(node):  # NOTE(review): neighbor type is not checked; presumably all are approved symbols - confirm
+                if sample not in G.nodes[neighbor]["samples"]:
+                    non_used_neighbors.append(neighbor)
+
+            if len(non_used_neighbors) == 1:
+                target_neighbor = non_used_neighbors[0]
+                df.loc[len(df)] = [  # new row at label len(df); values map to df's columns by position
+                    sample,
+                    "rename",
+                    node,
+                    target_neighbor,
+                    f"The unapproved symbol {node} is present in {sample} and only one of its approved neighbors ({target_neighbor}) is not also present in {sample}. Therefore, renaming {node} to {target_neighbor} in {sample}.",
+                ]
+                G.nodes[target_neighbor]["samples"].add(sample)  # later iterations now see the sample under the target
+                G.nodes[node]["samples"].remove(sample)
+
+    return df
+
+
 def resolve_unapproved(G: nx.DiGraph, df: pd.DataFrame) -> pd.DataFrame:
     for node in list(G.nodes()):
         if G.nodes[node]["type"] == "approvedSymbol":
diff --git a/tests/test_get_changes.py b/tests/test_get_changes.py
@@ -24,14 +24,23 @@ def test_cox1_and_co1():
     assert len(sample_changes) == 2
 
     sample1_changes = sample_changes["sample1"]
-    assert len(sample1_changes) == 1
-    assert sample1_changes.iloc[0]["action"] == "copy"
-    assert sample1_changes.iloc[0]["symbol"] == "COX1"
-    assert sample1_changes.iloc[0]["new"] == "MT-CO1"
+    assert len(sample1_changes) == 0
 
     sample2_changes = sample_changes["sample2"]
     assert len(sample2_changes) == 1
-    assert sample2_changes.iloc[0]["action"] == "conflict"
+    assert sample2_changes.iloc[0]["action"] == "rename"
+
+
+def test_cox1_co1_and_ptgs1():  # expects COX1 in sample2 to be renamed to MT-CO1, with no change recorded for PTGS1
+    sample_symbols = {"sample1": ["MT-CO1"], "sample2": ["COX1", "PTGS1"]}
+    _, sample_changes = get_changes(sample_symbols)  # first return value is not needed here
+
+    assert len(sample_changes) == 2  # one entry per input sample
+    assert len(sample_changes["sample1"]) == 0
+    assert len(sample_changes["sample2"]) == 1  # the COX1 rename is the only change
+    assert sample_changes["sample2"].iloc[0]["action"] == "rename"
+    assert sample_changes["sample2"].iloc[0]["symbol"] == "COX1"
+    assert sample_changes["sample2"].iloc[0]["new"] == "MT-CO1"
 
 
 def test_single_sample():
@@ -88,14 +97,11 @@ def test_cox2_and_co2():
     assert len(sample_changes) == 2
 
     sample1_changes = sample_changes["sample1"]
-    assert len(sample1_changes) == 1
-    assert sample1_changes.iloc[0]["action"] == "copy"
-    assert sample1_changes.iloc[0]["symbol"] == "COX2"
-    assert sample1_changes.iloc[0]["new"] == "MT-CO2"
+    assert len(sample1_changes) == 0
 
     sample2_changes = sample_changes["sample2"]
     assert len(sample2_changes) == 1
-    assert sample2_changes.iloc[0]["action"] == "conflict"
+    assert sample2_changes.iloc[0]["action"] == "rename"
 
 
 def test_cox3():
@@ -114,15 +120,15 @@ def test_cox3():
     assert len(sample2_changes) == 0
 
 
-def test_cox3_and_co1():
+def test_cox3_and_co3():
     sample_symbols = {"sample1": ["COX3"], "sample2": ["MT-CO3", "COX3"]}
 
     _, sample_changes = get_changes(sample_symbols)
     assert len(sample_changes) == 2
 
     sample1_changes = sample_changes["sample1"]
     assert len(sample1_changes) == 1
-    assert sample1_changes.iloc[0]["action"] == "copy"
+    assert sample1_changes.iloc[0]["action"] == "rename"
     assert sample1_changes.iloc[0]["symbol"] == "COX3"
     assert sample1_changes.iloc[0]["new"] == "MT-CO3"
 
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@`
`9`	`9`	`remove_self_edges,`
`10`	`10`	`remove_loose_ends,`
`11`	`11`	`resolve_unapproved,`
	`12`	`+ resolve_per_dataset,`
`12`	`13`	`)`
`13`	`14`
`14`	`15`
`@@ -54,6 +55,7 @@ def get_changes(`
`54`	`55`	`remove_loose_ends(G)`
`55`	`56`
`56`	`57`	`graph_manipulations: List[Callable[[nx.DiGraph, pd.DataFrame]]] = [`
	`58`	`+ resolve_per_dataset,`
`57`	`59`	`resolve_unapproved,`
`58`	`60`	`# aggregate_approved,`
`59`	`61`	`]`