Skip to content

Commit 030fd1c

Browse files
committed
PERF: Improve performance of multivarite OLS
Add short circuits when applicable to multivariate OLS General clean up of small issues closes #158
1 parent 77007ba commit 030fd1c

File tree

10 files changed

+117
-103
lines changed

10 files changed

+117
-103
lines changed

linearmodels/asset_pricing/model.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
Linear factor models for applications in asset pricing
33
"""
44
import numpy as np
5-
from numpy.linalg import pinv
65
from patsy.highlevel import dmatrix
76
from patsy.missing import NAAction
87
from scipy.optimize import minimize
@@ -16,7 +15,7 @@
1615
from linearmodels.compat.numpy import lstsq
1716
from linearmodels.iv.data import IVData
1817
from linearmodels.utility import (AttrDict, WaldTestStatistic, has_constant,
19-
matrix_rank, missing_warning)
18+
missing_warning)
2019

2120

2221
def callback_factory(obj, args, disp=1):
@@ -103,9 +102,9 @@ def _validate_data(self):
103102
raise ValueError('portfolios must not contains a constant or equivalent.')
104103
if has_constant(f)[0]:
105104
raise ValueError('factors must not contain a constant or equivalent.')
106-
if matrix_rank(f) < f.shape[1]:
105+
if np.linalg.matrix_rank(f) < f.shape[1]:
107106
raise ValueError('Model cannot be estimated. factors do not have full column rank.')
108-
if matrix_rank(p) < p.shape[1]:
107+
if np.linalg.matrix_rank(p) < p.shape[1]:
109108
raise ValueError('Model cannot be estimated. portfolios do not have full column rank.')
110109

111110
@property
@@ -212,13 +211,13 @@ def fit(self, cov_type='robust', debiased=True, **cov_config):
212211
fc = np.c_[np.ones((nobs, 1)), f]
213212
rp = f.mean(0)[:, None]
214213
fe = f - f.mean(0)
215-
b = pinv(fc) @ p
214+
b = np.linalg.pinv(fc) @ p
216215
eps = p - fc @ b
217216
alphas = b[:1].T
218217

219218
nloading = (nfactor + 1) * nportfolio
220219
xpxi = np.eye(nloading + nfactor)
221-
xpxi[:nloading, :nloading] = np.kron(np.eye(nportfolio), pinv(fc.T @ fc / nobs))
220+
xpxi[:nloading, :nloading] = np.kron(np.eye(nportfolio), np.linalg.pinv(fc.T @ fc / nobs))
222221
f_rep = np.tile(fc, (1, nportfolio))
223222
eps_rep = np.tile(eps, (nfactor + 1, 1)) # 1 2 3 ... 25 1 2 3 ...
224223
eps_rep = eps_rep.ravel(order='F')
@@ -251,7 +250,7 @@ def fit(self, cov_type='robust', debiased=True, **cov_config):
251250

252251
# Return values
253252
alpha_vcv = vcv[:nportfolio, :nportfolio]
254-
stat = float(alphas.T @ pinv(alpha_vcv) @ alphas)
253+
stat = float(alphas.T @ np.linalg.pinv(alpha_vcv) @ alphas)
255254
jstat = WaldTestStatistic(stat, 'All alphas are 0', nportfolio, name='J-statistic')
256255
params = b.T
257256
betas = b[1:].T

linearmodels/panel/data.py

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
from itertools import product
22

33
import numpy as np
4-
import pandas as pd
5-
from pandas import DataFrame, Panel, Series
4+
from pandas import DataFrame, Panel, Series, MultiIndex, get_dummies, Categorical
65

76
from linearmodels.compat.numpy import lstsq
87
from linearmodels.compat.pandas import (is_categorical,
@@ -32,10 +31,10 @@ class _Panel(object):
3231
def __init__(self, df):
3332
self._items = df.columns
3433
index = df.index
35-
self._major_axis = pd.Series(index.levels[1][index.labels[1]]).unique()
36-
self._minor_axis = pd.Series(index.levels[0][index.labels[0]]).unique()
37-
self._full_index = pd.MultiIndex.from_product([self._minor_axis,
38-
self._major_axis])
34+
self._major_axis = Series(index.levels[1][index.labels[1]]).unique()
35+
self._minor_axis = Series(index.levels[0][index.labels[0]]).unique()
36+
self._full_index = MultiIndex.from_product([self._minor_axis,
37+
self._major_axis])
3938
new_df = df.reindex(self._full_index)
4039
self._frame = new_df
4140
i, j, k = len(self._items), len(self._major_axis), len(self.minor_axis)
@@ -45,12 +44,12 @@ def __init__(self, df):
4544
@classmethod
4645
def from_array(cls, values, items, major_axis, minor_axis):
4746
index = list(product(minor_axis, major_axis))
48-
index = pd.MultiIndex.from_tuples(index)
47+
index = MultiIndex.from_tuples(index)
4948
i, j, k = len(items), len(major_axis), len(minor_axis)
5049
values = np.swapaxes(values.copy(), 0, 2).ravel()
5150
values = np.reshape(values, ((j * k), i))
5251

53-
df = pd.DataFrame(values, index=index, columns=items)
52+
df = DataFrame(values, index=index, columns=items)
5453
return cls(df)
5554

5655
@property
@@ -82,7 +81,7 @@ def convert_columns(s, drop_first):
8281
s = s.astype('category')
8382

8483
if is_categorical(s):
85-
out = pd.get_dummies(s, drop_first=drop_first)
84+
out = get_dummies(s, drop_first=drop_first)
8685
out.columns = [str(s.name) + '.' + str(c) for c in out]
8786
return out
8887
return s
@@ -169,18 +168,18 @@ def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True, copy=
169168
except ImportError:
170169
pass
171170

172-
if isinstance(x, Series) and isinstance(x.index, pd.MultiIndex):
171+
if isinstance(x, Series) and isinstance(x.index, MultiIndex):
173172
x = DataFrame(x)
174173
elif isinstance(x, Series):
175174
raise ValueError('Series can only be used with a 2-level MultiIndex')
176175

177176
if isinstance(x, (Panel, DataFrame)):
178177
if isinstance(x, DataFrame):
179-
if isinstance(x.index, pd.MultiIndex):
178+
if isinstance(x.index, MultiIndex):
180179
if len(x.index.levels) != 2:
181180
raise ValueError('DataFrame input must have a '
182181
'MultiIndex with 2 levels')
183-
if isinstance(self._original, (pd.DataFrame, PanelData, pd.Series)):
182+
if isinstance(self._original, (DataFrame, PanelData, Series)):
184183
for i in range(2):
185184
index_names[i] = x.index.levels[i].name or index_names[i]
186185
self._frame = x
@@ -388,9 +387,9 @@ def general_demean(self, groups, weights=None):
388387
if not isinstance(groups, PanelData):
389388
groups = PanelData(groups)
390389
if weights is None:
391-
weights = PanelData(pd.DataFrame(np.ones((self._frame.shape[0], 1)),
392-
index=self.index,
393-
columns=['weights']))
390+
weights = PanelData(DataFrame(np.ones((self._frame.shape[0], 1)),
391+
index=self.index,
392+
columns=['weights']))
394393
weights = weights.values2d
395394
groups = groups.values2d.astype(np.int64, copy=False)
396395

@@ -417,11 +416,11 @@ def demean_pass(frame, weights, root_w):
417416
return frame
418417

419418
# Swap out the index for better performance
420-
init_index = pd.DataFrame(groups)
419+
init_index = DataFrame(groups)
421420
init_index.set_index(list(init_index.columns), inplace=True)
422421

423422
root_w = np.sqrt(weights)
424-
weights = pd.DataFrame(weights, index=init_index.index)
423+
weights = DataFrame(weights, index=init_index.index)
425424
wframe = root_w * self._frame
426425
wframe.index = init_index.index
427426

@@ -619,7 +618,7 @@ def dummies(self, group='entity', drop_first=False):
619618
axis = 0 if group == 'entity' else 1
620619
labels = self._frame.index.labels
621620
levels = self._frame.index.levels
622-
cat = pd.Categorical(levels[axis][labels[axis]])
623-
dummies = pd.get_dummies(cat, drop_first=drop_first)
621+
cat = Categorical(levels[axis][labels[axis]])
622+
dummies = get_dummies(cat, drop_first=drop_first)
624623
cols = self.entities if group == 'entity' else self.time
625624
return dummies[[c for c in cols if c in dummies]].astype(np.float64, copy=False)

linearmodels/system/_utility.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
import numpy as np
22
import pandas as pd
3-
from numpy import cumsum, diag, eye, zeros
4-
from numpy.linalg import inv
5-
6-
from linearmodels.utility import matrix_rank
3+
from numpy.linalg import inv, matrix_rank
74

85

96
def blocked_column_product(x, s):
@@ -85,9 +82,19 @@ def blocked_inner_prod(x, s):
8582
"""
8683
k = len(x)
8784
widths = list(map(lambda m: m.shape[1], x))
88-
cum_width = cumsum([0] + widths)
85+
s_is_diag = np.all((s - np.diag(np.diag(s))) == 0.0)
86+
87+
w0 = widths[0]
88+
homogeneous = all([w == w0 for w in widths])
89+
if homogeneous and not s_is_diag:
90+
# Fast path when all x have same number of columns
91+
# Slower than diag case when k is large since many 0s
92+
x = np.hstack(x)
93+
return x.T @ x * np.kron(s, np.ones((w0, w0)))
94+
95+
cum_width = np.cumsum([0] + widths)
8996
total = sum(widths)
90-
out = zeros((total, total))
97+
out = np.zeros((total, total))
9198

9299
for i in range(k):
93100
xi = x[i]
@@ -97,7 +104,7 @@ def blocked_inner_prod(x, s):
97104
out[sel_i, sel_i] = prod
98105

99106
# Short circuit if identity
100-
if np.all((s - diag(diag(s))) == 0.0):
107+
if s_is_diag:
101108
return out
102109

103110
for i in range(k):
@@ -183,7 +190,7 @@ def blocked_full_inner_product(x, s):
183190
def inv_matrix_sqrt(s):
184191
vecs, vals = np.linalg.eigh(s)
185192
vecs = 1.0 / np.sqrt(vecs)
186-
out = vals @ diag(vecs) @ vals.T
193+
out = vals @ np.diag(vecs) @ vals.T
187194
return (out + out.T) / 2
188195

189196

@@ -261,7 +268,7 @@ def _verify_constraints(self):
261268
def _compute_transform(self):
262269
r = self._ra
263270
c, k = r.shape
264-
m = eye(k) - r.T @ inv(r @ r.T) @ r
271+
m = np.eye(k) - r.T @ inv(r @ r.T) @ r
265272
vals, vecs = np.linalg.eigh(m)
266273
vals = np.real(vals)
267274
vecs = np.real(vecs)

linearmodels/system/covariance.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -181,15 +181,27 @@ def __init__(self, x, eps, sigma, full_sigma, gls=False, debiased=False, constra
181181
self._name = 'Heteroskedastic (Robust) Covariance'
182182

183183
k = len(x)
184-
weights = inv(sigma) if gls else eye(k)
185-
bigx = blocked_diag_product(x, weights)
186184
nobs = eps.shape[0]
187-
e = eps.T.ravel()[:, None]
188-
bigxe = bigx * e
189-
m = bigx.shape[1]
190-
xe = zeros((nobs, m))
191-
for i in range(nobs):
192-
xe[i, :] = bigxe[i::nobs].sum(0)[None, :]
185+
186+
if gls:
187+
weights = inv(sigma)
188+
bigx = blocked_diag_product(x, weights)
189+
e = eps.T.ravel()[:, None]
190+
bigxe = bigx * e
191+
m = bigx.shape[1]
192+
xe = zeros((nobs, m))
193+
for i in range(nobs):
194+
xe[i, :] = bigxe[i::nobs].sum(0)[None, :]
195+
else:
196+
# Do not require blocking when not using GLS
197+
k_tot = sum(map(lambda a: a.shape[1], x))
198+
xe = empty((nobs, k_tot))
199+
loc = 0
200+
for i in range(k):
201+
offset = x[i].shape[1]
202+
xe[:, loc:loc+offset] = x[i] * eps[:, i:i+1]
203+
loc += offset
204+
193205
self._moments = xe
194206

195207
def _xeex(self):

0 commit comments

Comments
 (0)