Merge pull request #140 from fedarko/fix-139

mortonjt · web-flow · commit 0e7ae547062a · 2020-10-08T08:39:24.000-06:00
Fix, test, and document filtering bug
diff --git a/README.md b/README.md
@@ -604,7 +604,7 @@ The larger the batch size, the more samples you average per iteration, but the l
 all samples that have less than 1000 reads will be filtered out.
 
 `--min-feature-count` will filter out features according to how many __samples__ they appear in.  For instance, if `--min-feature-count 10` is specified,
-then all features than appear in less than 10 samples will be thrown out.  It is important to note that we are filtering according to number of samples rather than number of reads.  The reason why this behavior is chosen relates to a rule of thumb commonly used in linear regression - if a microbe appears in less than 10 samples, it is difficult to fit a meaningful line for that microbe.  In other words, there is not even resolution in the study to say anything meaningful about that microbe in the context of differential abundance analysis.  The `--min-feature-count` filter is applied _after_ the `--min-sample-count` is applied.
+then all features that appear in less than 10 samples will be thrown out.  It is important to note that we are filtering according to number of samples rather than number of reads.  The reason why this behavior is chosen relates to a rule of thumb commonly used in linear regression - if a microbe appears in less than 10 samples, it is difficult to fit a meaningful line for that microbe.  In other words, there is not enough resolution in the study to say anything meaningful about that microbe in the context of differential abundance analysis.  The `--min-feature-count` filter is applied _after_ the `--min-sample-count` is applied, so it's possible for (for example) a sample to get filtered out which in turn causes a feature to get filtered out.
 
 ## 7.4. FAQs: Output files
 
diff --git a/songbird/tests/test_util.py b/songbird/tests/test_util.py
@@ -210,6 +210,94 @@ def test_match_and_filter_big_table(self):
         self.assertEqual(res_design.shape[0], drop_design.shape[0])
         self.assertEqual(res_metadata.shape[0], drop_metadata.shape[0])
 
+    def test_match_and_filter_exact_minimum_feature_count_used(self):
+        formula = 'C(categorical) + continuous'
+
+        # None of the features should be dropped when min_feature_count is 2 or
+        # below, since all features occur in at least two samples.
+        res = match_and_filter(self.big_table, self.metadata, formula,
+                               min_sample_count=0, min_feature_count=2)
+
+        self.assertEqual(
+            set(res[0].ids("observation")),
+            set(self.big_table.ids("observation"))
+        )
+        # Samples should be unchanged (well, s9 gets filtered because it's not
+        # in the metadata, but everything else is kept)
+        self.assertEqual(
+            set(res[0].ids()),
+            set(['s1', 's2', 's3', 's4', 's5', 's6'])
+        )
+
+        # When we bump min_feature_count up to 3, though, stuff should start
+        # getting filtered -- the last three features in self.big_table all
+        # occur in just two samples each.
+        res = match_and_filter(self.big_table, self.metadata, formula,
+                               min_sample_count=0, min_feature_count=3)
+        self.assertEqual(
+            set(res[0].ids("observation")),
+            set(['o1', 'o2', 'o3', 'o4'])
+        )
+        # Again, non-s9 samples unchanged because min sample count is 0
+        self.assertEqual(
+            set(res[0].ids()),
+            set(['s1', 's2', 's3', 's4', 's5', 's6'])
+        )
+
+    def test_match_and_filter_exact_minimum_sample_count_used(self):
+        formula = 'C(categorical) + continuous'
+
+        # None of the samples (except s9, which isn't in the metadata)
+        # should be dropped when min_sample_count is 3 or
+        # below, since all samples have a total count of at least 3.
+        res = match_and_filter(self.big_table, self.metadata, formula,
+                               min_sample_count=3, min_feature_count=0)
+
+        self.assertEqual(
+            set(res[0].ids()),
+            set(self.big_table.ids()) - set(["s9"])
+        )
+        # Features should remain unchanged due to min feature count being 0.
+        self.assertEqual(
+            set(res[0].ids("observation")),
+            set(self.big_table.ids("observation"))
+        )
+
+        # When we bump min_sample_count up to 4, s2 and s4 should get filtered
+        # since they each have a total count of 3
+        res = match_and_filter(self.big_table, self.metadata, formula,
+                               min_sample_count=4, min_feature_count=0)
+        self.assertEqual(set(res[0].ids()), set(['s1', 's3', 's5', 's6']))
+        # Features should remain unchanged due to min feature count being 0.
+        self.assertEqual(
+            set(res[0].ids("observation")),
+            set(self.big_table.ids("observation"))
+        )
+
+    def test_match_and_filter_exact_minima_together_in_sequence(self):
+        formula = 'C(categorical) + continuous'
+
+        res = match_and_filter(self.big_table, self.metadata, formula,
+                               min_sample_count=4, min_feature_count=4)
+
+        # Since min_sample_count is 4, s2 and s4 get filtered out due to having
+        # a total count of 3
+        self.assertEqual(
+            set(res[0].ids()),
+            set(['s1', 's3', 's5', 's6'])
+        )
+        # Since min_feature_count is 4, o4, o5, o6, and o7 all get filtered out
+        # since they were present in 3, 2, 2, and 2 samples respectively
+        # *BEFORE* the sample filtering was done. However, in addition to this,
+        # o2 and o3 will get filtered out because (since we filtered out s2 and
+        # s4) they are now not present in 4 samples! And as match_and_filter's
+        # docs explain, the sample filtering is done first, and this can impact
+        # the feature filtering. So... "oops, all o1".
+        self.assertEqual(
+            set(res[0].ids("observation")),
+            set(['o1'])
+        )
+
     def test_split_training_random(self):
         np.random.seed(0)
         design = pd.DataFrame(
diff --git a/songbird/util.py b/songbird/util.py
@@ -135,6 +135,11 @@ def match_and_filter(table, metadata, formula,
 
     This will also return the patsy representation.
 
+    NOTE that this does sample filtering before feature filtering -- it's
+    possible that this could impact the results in unintuitive ways: for
+    example, filtering out a sample can cause a feature to then be filtered
+    out for not being present in enough samples.
+
     Parameters
     ----------
     table : biom.Table
@@ -148,14 +153,17 @@ def match_and_filter(table, metadata, formula,
         Filtered biom table
     metadata : pd.DataFrame
         Sample metadata
+    design : patsy.DesignMatrix
+        Design matrix created from the formula and filtered metadata
     """
-    # match them
-
+    # Use >= so that samples with exactly "min_sample_count" counts, or
+    # features present in exactly "min_feature_count" samples, are *not*
+    # filtered out.
     def sample_filter(val, id_, md):
-        return id_ in metadata.index and np.sum(val) > min_sample_count
+        return id_ in metadata.index and np.sum(val) >= min_sample_count
 
     def read_filter(val, id_, md):
-        return np.sum(val > 0) > min_feature_count
+        return np.sum(val > 0) >= min_feature_count
 
     table = table.filter(sample_filter, axis='sample', inplace=False)
     table = table.filter(read_filter, axis='observation', inplace=False)