BUG: Fix small bugs due to singleton dropping

bashtage · bashtage · commit 1345f1b1b41c · 2019-03-12T22:32:48.000Z
Correct indexing of cluster variable
Ensure collected effects always returns the expected type
Refactor duplicated and wrong code
Fix dummy creation to ensure dtype can handle data
Extend test data generation to handle sparse panels
Add extended tests of singleton dropping
diff --git a/.travis.yml b/.travis.yml
@@ -47,7 +47,7 @@ matrix:
     - PANDAS=0.22
     - XARRAY=0.10
     - STATSMODELS=0.9
-  - python: 3.6
+  - python: 3.7
     env:
     - PYTHON=3.7
     - NUMPY=1.15
@@ -56,7 +56,7 @@ matrix:
     - XARRAY=0.10
     - STATSMODELS=0.9
     - OPENBLAS=1
-  - python: 3.6
+  - python: 3.7
     env:
     - PYTHON=3.7
     - USE_PYPI=true
diff --git a/linearmodels/panel/model.py b/linearmodels/panel/model.py
@@ -566,7 +566,7 @@ def _choose_cov(self, cov_type, **cov_config):
             else:
                 clusters = pd.DataFrame(group_ids)
         if self._singleton_index is not None and clusters is not None:
-            clusters = clusters.loc(~self._singleton_index)
+            clusters = clusters.loc[~self._singleton_index]
 
         cov_config_upd['clusters'] = np.asarray(clusters) if clusters is not None else clusters
 
@@ -780,6 +780,8 @@ def __init__(self, dependent, exog, *, weights=None, entity_effects=False, time_
         self._drop_singletons()
 
     def _collect_effects(self):
+        if not self._has_effect:
+            return np.empty((self.dependent.shape[0], 0))
         effects = []
         if self.entity_effects:
             effects.append(np.asarray(self.dependent.entity_ids).squeeze())
@@ -792,8 +794,7 @@ def _collect_effects(self):
         return np.column_stack(effects)
 
     def _drop_singletons(self):
-        has_effects = self.entity_effects or self.time_effects or self.other_effects is not None
-        if self._singletons or not has_effects:
+        if self._singletons or not self._has_effect:
             return
         effects = self._collect_effects()
         retain = in_2core_graph(effects)
@@ -1187,12 +1188,7 @@ def _determine_df_adjustment(self, cov_type, **cov_config):
         if clusters is None:  # No clusters
             return True
 
-        effects = [self._other_effect_cats] if self.other_effects else []
-        if self.entity_effects:
-            effects.append(self.dependent.entity_ids)
-        if self.time_effects:
-            effects.append(self.dependent.time_ids)
-        effects = np.column_stack(effects)
+        effects = self._collect_effects()
         if num_effects == 1:
             return not self._is_effect_nested(effects, clusters)
         return True  # Default case for 2-way -- not completely clear
diff --git a/linearmodels/panel/utility.py b/linearmodels/panel/utility.py
@@ -48,7 +48,7 @@ def dummy_matrix(cats, format='csc', drop='first', drop_all=False):
         rows = np.arange(nobs)
         ucats, inverse = np.unique(codes[:, i], return_inverse=True)
         ncategories = len(ucats)
-        bits = min([i for i in (8, 16, 32, 64) if i - 1 > np.log2(ncategories)])
+        bits = min([i for i in (8, 16, 32, 64) if i - 1 > np.log2(ncategories + total_dummies)])
         replacements = np.arange(ncategories, dtype='int{:d}'.format(bits))
         cols = replacements[inverse]
         if i == 0 and not drop_all:
@@ -91,10 +91,10 @@ def _remove_node(node, meta, orig_dest):
     node : int
         ID of the node to remove
     meta : ndarray
-        Array with rows containins node, count, and address where
-        address is used to find the first occurence in orig_desk
+        Array with rows containing node, count, and address where
+        address is used to find the first occurrence in orig_dest
     orig_dest : ndarray
-        Array with rows containins origin and destination nodes
+        Array with rows containing origin and destination nodes
 
     Returns
     -------
@@ -145,10 +145,10 @@ def _drop_singletons(meta, orig_dest):
     Parameters
     ----------
     meta : ndarray
-        Array with rows containins node, count, and address where
-        address is used to find the first occurence in orig_desk
+        Array with rows containing node, count, and address where
+        address is used to find the first occurrence in orig_dest
     orig_dest : ndarray
-        Array with rows containins origin and destination nodes
+        Array with rows containing origin and destination nodes
     """
     for i in range(meta.shape[0]):
         if meta[i, 1] == 1:
diff --git a/linearmodels/tests/panel/_utility.py b/linearmodels/tests/panel/_utility.py
@@ -57,7 +57,8 @@ def lsdv(y: DataFrame, x: DataFrame, has_const=False, entity=False, time=False,
     return params[:nvar]
 
 
-def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects=0, rng=None):
+def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects=0, rng=None,
+                  num_cats=4):
     if rng is None:
         np.random.seed(12345)
     else:
@@ -75,7 +76,13 @@ def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects
     else:
         cats = ['cat.' + str(i) for i in range(other_effects)]
     if other_effects:
-        c = np.random.randint(0, 4, (other_effects, t, n))
+        if not isinstance(num_cats, list):
+            num_cats = [num_cats] * other_effects
+        c = []
+        for i in range(other_effects):
+            nc = num_cats[i]
+            c.append(np.random.randint(0, nc, (1, t, n)))
+        c = np.concatenate(c, 0)
 
     vcats = ['varcat.' + str(i) for i in range(2)]
     vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
diff --git a/linearmodels/tests/panel/test_panel_ols.py b/linearmodels/tests/panel/test_panel_ols.py
@@ -36,6 +36,17 @@ def large_data(request):
     return generate_data(missing, datatype, const=const, ntk=(51, 71, 5), other_effects=2)
 
 
+singleton_ids = [i for i, p in zip(ids, perms) if p[1] == 'pandas' and not p[-1]]
+singleton_perms = [p for p in perms if p[1] == 'pandas' and not p[-1]]
+
+
+@pytest.fixture(params=singleton_perms, ids=singleton_ids)
+def singleton_data(request):
+    missing, datatype, const = request.param
+    return generate_data(missing, datatype, const=const, ntk=(91, 15, 5), other_effects=2,
+                         num_cats=[5 * 91, 15])
+
+
 perms = list(product(missing, datatypes))
 ids = list(map(lambda s: '-'.join(map(str, s)), perms))
 
@@ -1118,3 +1129,31 @@ def test_masked_singleton_removal():
     mod = PanelOLS(y, x, singletons=False, entity_effects=True, time_effects=True)
     res = mod.fit()
     assert res.nobs == 6
+
+
+def test_singleton_removal_other_effects(data):
+    mod_keep = PanelOLS(data.y, data.x, weights=data.w, other_effects=data.c, singletons=True)
+    res_keep = mod_keep.fit()
+
+    mod = PanelOLS(data.y, data.x, weights=data.w, other_effects=data.c, singletons=False)
+    res = mod.fit(cov_type='clustered', clusters=data.vc1)
+
+    assert res.nobs <= res_keep.nobs
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize('other_effects', [1, 2])
+def test_singleton_removal_mixed(singleton_data, other_effects):
+    if other_effects == 1:
+        other_effects = PanelData(singleton_data.c).dataframe.iloc[:, [0]]
+    elif other_effects == 2:
+        other_effects = singleton_data.c
+    mod = PanelOLS(singleton_data.y, singleton_data.x,
+                   other_effects=other_effects)
+    res_keep = mod.fit(use_lsmr=True)
+
+    mod = PanelOLS(singleton_data.y, singleton_data.x,
+                   other_effects=other_effects, singletons=False)
+    res = mod.fit(cov_type='clustered', clusters=singleton_data.vc2, use_lsmr=True)
+    assert_allclose(res_keep.params, res.params)
+    assert res.nobs <= res_keep.nobs