apache · anuunchin · May 26, 2025 · May 27, 2025 · May 27, 2025 · Jun 1, 2025
diff --git a/scripts/builtin/ampute.dml b/scripts/builtin/ampute.dml
@@ -30,7 +30,6 @@
 # mech         a string [either "MAR", "MNAR", or "MCAR"] specifying the missingness mechanism. Chosen "MAR" and "MNAR" settings will be overridden if a non-default weight matrix is specified
 # weights      a weight matrix [shape: k-by-m], containing weights that will be used to calculate the weighted sum scores. Will be overridden if mech == "MCAR"
 # seed         a manually defined seed for reproducible RNG
-
 # -------------------------------------------------------------------------------------
 #
 # OUTPUT:

diff --git a/scripts/builtin/confusionMatrix.dml b/scripts/builtin/confusionMatrix.dml
@@ -23,7 +23,7 @@
 # and actual labels. We return both the counts and relative frequency
 # (normalized by sum of true labels)
 #
-# .. code-block::
+# .. code-block:: text
 #
 #                   True Labels
 #                     1    2

diff --git a/scripts/builtin/cooccurrenceMatrix.dml b/scripts/builtin/cooccurrenceMatrix.dml
@@ -18,22 +18,21 @@
 # under the License.
 #
 #-------------------------------------------------------------
-#
-# The implementation is based on
+
+# Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
+# Adds an index column to the result. The implementation is based on
 # https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
 #
-#-------------------------------------------------------------
-
-## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
-## Adds an index column to the result.
 # INPUT:
 # ------------------------------------------------------------------------------
 # S     (Frame[Unknown]): 1D input data frame containing text data.
 # ------------------------------------------------------------------------------
+#
 # OUTPUT:
 # ------------------------------------------------------------------------------
 # result    (Frame[Unknown]): Processed text data with an index column.
 # ------------------------------------------------------------------------------
+
 processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
     print("processText");
     tmpStr = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");

diff --git a/scripts/builtin/correctTypos.dml b/scripts/builtin/correctTypos.dml
@@ -34,10 +34,10 @@
 #
 # INPUT:
 # ----------------------------------------------------------------------------------------
-# strings              The nx1 input frame of corrupted strings
-# frequency_threshold  Strings that occur above this frequency level will not be corrected
-# distance_threshold   Max distance at which strings are considered similar
-# is_verbose           Print debug information
+# strings               The nx1 input frame of corrupted strings
+# frequency_threshold   Strings that occur above this frequency level will not be corrected
+# distance_threshold    Max distance at which strings are considered similar
+# is_verbose            Print debug information
 # ----------------------------------------------------------------------------------------
 #
 # OUTPUT:

diff --git a/scripts/builtin/decisionTree.dml b/scripts/builtin/decisionTree.dml
@@ -30,9 +30,9 @@
 #   and the following trees, M would look as follows:
 #
 #   (L1)               |d<5|
-#                     /     \
+#                     /     \\
 #   (L2)           P1:2    |a<7|
-#                          /   \
+#                          /   \\
 #   (L3)                 P2:2 P3:1
 #
 #   --> M :=

diff --git a/scripts/builtin/differenceStatistics.dml b/scripts/builtin/differenceStatistics.dml
@@ -28,6 +28,11 @@
 # X        First Matrix to compare
 # Y        Second Matrix to compare
 # --------------------------------------------------------------------------------
+#
+# OUTPUT:
+# -------------------------------------------------------------------------------------
+# stats    Difference statistics
+# -------------------------------------------------------------------------------------
 
 m_differenceStatistics = function(Matrix[Double] X, Matrix[Double] Y)  {
 

diff --git a/scripts/builtin/dist.dml b/scripts/builtin/dist.dml
@@ -21,6 +21,18 @@
 
 # Returns Euclidean distance matrix (distances between N n-dimensional points)
 #
+# .. code-block:: python
+#
+#   >>> import numpy as np
+#   >>> with SystemDSContext() as sds:
+#   ...     X = sds.from_numpy(np.array([[0], [3], [4]]))
+#   ...     out = dist(X).compute()
+#   ...     print(out)
+#   [[0. 3. 4.]
+#    [3. 0. 1.]
+#    [4. 1. 0.]]
+#
+#
 # INPUT:
 # --------------------------------------------------------------------------------
 # X       Matrix to calculate the distance inside

diff --git a/scripts/builtin/glove.dml b/scripts/builtin/glove.dml
@@ -18,6 +18,31 @@
 # under the License.
 #-------------------------------------------------------------
 
+# Computes the vector embeddings for words in a large text corpus. 
+#
+# INPUT:
+# -------------------------------------------------------------------------------- 
+# input                 1D input corpus in CSV format.
+# seed                  Random seed for reproducibility.
+# vector_size           Dimensionality of word vectors, V.
+# eta                   Learning rate for optimization, recommended value: 0.05.
+# alpha                 Weighting function parameter, recommended value: 0.75.
+# x_max                 Maximum co-occurrence value as per the GloVe paper: 100.
+# tol                   Tolerance value to avoid overfitting, recommended value: 1e-4.
+# iterations            Total number of training iterations.
+# print_loss_it         Interval (in iterations) for printing the loss.
+# maxTokens             Maximum number of tokens per text entry.
+# windowSize            Context window size.
+# distanceWeighting     Whether to apply distance-based weighting.
+# symmetric             Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
+# ------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ------------------------------------------------------------------------------
+# G                     The word indices and their word vectors, of shape (N, V). Each word is represented as a vector of shape (1, V).
+# ------------------------------------------------------------------------------
+
+
 init = function(matrix[double] cooc_matrix, double x_max, double alpha)
   return(matrix[double] weights, matrix[double] log_cooc_matrix){
   E = 2.718281828;
@@ -119,7 +144,7 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_i
     G = cbind(cooc_index[,2], as.frame(G));
 }
 
-glove = function(
+f_glove = function(
     Frame[Unknown] input,
     int seed, int vector_size,
     double alpha, double eta,

diff --git a/scripts/builtin/imputeByKNN.dml b/scripts/builtin/imputeByKNN.dml
@@ -25,23 +25,16 @@
 # the missing values by column means. Currently, only the column with the most
 # missing values is actually imputed.
 #
-# ------------------------------------------------------------------------------
 # INPUT:
 # ------------------------------------------------------------------------------
-# X           Matrix with missing values, which are represented as NaNs
-# method      Method used for imputing missing values with different performance
-#             and accuracy tradeoffs:
-#             'dist' (default): Compute all-pairs distances and impute the
-#                               missing values by closest. O(N^2 * #features)
-#             'dist_missing':   Compute distances between data and records with
-#                               missing values. O(N*M * #features), assuming
-#                               that the number of records with MV is M<<N.
-#             'dist_sample':    Compute distances between sample of data and
-#                               records with missing values. O(S*M * #features)
-#                               with M<<N and S<<N, but suboptimal imputation.
-# seed        Root seed value for random/sample calls for deterministic behavior
-#             -1 for true randomization
-# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
+# X             Matrix with missing values, which are represented as NaNs
+# method        Method used for imputing missing values with different performance and accuracy tradeoffs:\n
+#               - 'dist' (default): Compute all-pairs distances and impute the missing values by closest. O(N^2 * #features)
+#               - 'dist_missing': Compute distances between data and records with missing values. O(N*M * #features), assuming that the number of records with MV is M<<N.
+#               - 'dist_sample': Compute distances between sample of data and records with missing values. O(S*M * #features) with M<<N and S<<N, but suboptimal imputation.
+#
+# seed          Root seed value for random/sample calls for deterministic behavior. -1 for true randomization
+# sample_frac   Sample fraction for 'dist_sample' (value between 0 and 1)
 # ------------------------------------------------------------------------------
 #
 # OUTPUT:

diff --git a/scripts/builtin/lm.dml b/scripts/builtin/lm.dml
@@ -23,6 +23,30 @@
 # method or the conjugate gradient algorithm depending on the input size
 # of the matrices (See lmDS-function and lmCG-function respectively).
 #
+# .. code-block:: python
+#
+#   >>> import numpy as np
+#   >>> from systemds.context import SystemDSContext
+#   >>> from systemds.operator.algorithm import lm
+#   >>> from sklearn.linear_model import LinearRegression
+#   >>>
+#   >>> np.random.seed(7)
+#   >>> X = np.random.rand(30, 1)
+#   >>> Y = np.random.rand(30, 1)
+#   >>> regressor = LinearRegression(fit_intercept=False)
+#   >>> model = regressor.fit(X, Y).coef_
+#   >>>
+#   >>> with SystemDSContext() as sds:
+#   ...     X_sds = sds.from_numpy(X)
+#   ...     Y_sds = sds.from_numpy(Y)
+#   ...     sds_model_weights = lm(X_sds, Y_sds, verbose=False).compute()
+#   ...     model = model.reshape(sds_model_weights.shape)
+#   ...     eps = 1e-03
+#   ...
+#   ...     print(np.allclose(sds_model_weights, model, eps))
+#   True
+#
+#
 # INPUT:
 # --------------------------------------------------------------------
 # X        Matrix of feature vectors.
@@ -41,6 +65,7 @@
 # B     The model fit beta that can be used as input in lmPredict
 # ---------------------------------------------------------------
 
+
 m_lm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
     Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE)
     return (Matrix[Double] B) {

diff --git a/scripts/builtin/normalize.dml b/scripts/builtin/normalize.dml
@@ -22,6 +22,18 @@
 # Min-max normalization (a.k.a. min-max scaling) to range [0,1]. For matrices 
 # of positive values, this normalization preserves the input sparsity.
 #
+# .. code-block:: python
+#
+#   >>> import numpy as np
+#   >>> from systemds.context import SystemDSContext
+#   >>> with SystemDSContext() as sds:
+#   ...     X = sds.from_numpy(np.array([[1, 2], [3, 4]]))
+#   ...     Y, cmin, cmax = normalize(X).compute()
+#   ...     print(Y)
+#   [[0. 0.]
+#    [1. 1.]]
+#
+#
 # INPUT:
 # ---------------------------------------------------------------------------------------
 # X     Input feature matrix of shape n-by-m

diff --git a/scripts/builtin/quantizeByCluster.dml b/scripts/builtin/quantizeByCluster.dml
@@ -58,7 +58,7 @@
 #           the product quantization. Only relevant when space_decomp = TRUE.
 # ------------------------------------------------------------------------------------------
 
-m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10,
+m_quantizeByCluster = function(Matrix[Double] X, Integer M = 4, Integer k = 10, Integer runs = 10,
     Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean space_decomp=FALSE, Integer seed = -1)
   return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)
 {

diff --git a/scripts/builtin/randomForest.dml b/scripts/builtin/randomForest.dml
@@ -26,16 +26,17 @@
 # and optionally subset of features (columns). During tree construction, split
 # candidates are additionally chosen on a sample of remaining features.
 #
-# .. code-block::
+# .. code-block:: text
 #
 #   For example, given a feature matrix with features [a,b,c,d]
 #   and the following two trees, M (the output) would look as follows:
 #
 #   (L1)          |a<7|                   |d<5|
-#                /     \                 /     \
+#                /     \\                 /     \\
 #   (L2)     |c<3|     |b<4|         |a<7|     P3:2
-#            /   \     /   \         /   \
+#            /   \\     /   \\         /   \\
 #   (L3)   P1:2 P2:1 P3:1 P4:2     P1:2 P2:1
+#
 #   --> M :=
 #   [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
 #    [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
@@ -46,6 +47,48 @@
 #   (e.g., [1,1,1,0] if we sampled a,b,c of the four features)
 #
 #
+# .. code-block:: python
+#
+#   >>> import numpy as np
+#   >>> from systemds.context import SystemDSContext
+#   >>> from systemds.operator.algorithm import randomForest, randomForestPredict
+#   >>>
+#   >>> # tiny toy dataset
+#   >>> X = np.array([[1],
+#   ...               [2],
+#   ...               [10],
+#   ...               [11]], dtype=np.int64)
+#   >>> y = np.array([[1],
+#   ...               [1],
+#   ...               [2],
+#   ...               [2]], dtype=np.int64)
+#   >>>
+#   >>> with SystemDSContext() as sds:
+#   ...     X_sds = sds.from_numpy(X)
+#   ...     y_sds = sds.from_numpy(y)
+#   ...
+#   ...     ctypes = sds.from_numpy(np.array([[1, 2]], dtype=np.int64))
+#   ...
+#   ...     # train a 4-tree forest (no sampling)
+#   ...     M = randomForest(
+#   ...             X_sds, y_sds, ctypes,
+#   ...             num_trees    = 4,
+#   ...             sample_frac  = 1.0,
+#   ...             feature_frac = 1.0,
+#   ...             max_depth    = 3,
+#   ...             min_leaf     = 1,
+#   ...             min_split    = 2,
+#   ...             seed         = 42
+#   ...          )
+#   ...
+#   ...     preds = randomForestPredict(X_sds, ctypes, M).compute()
+#   ...     print(preds)
+#   [[1.]
+#    [1.]
+#    [2.]
+#    [2.]]
+#
+#
 # INPUT:
 # ------------------------------------------------------------------------------
 # X               Feature matrix in recoded/binned representation

diff --git a/scripts/builtin/shapExplainer.dml b/scripts/builtin/shapExplainer.dml
@@ -51,6 +51,7 @@
 # S              Matrix holding the shapley values along the cols, one row per instance.
 # expected       Double holding the average prediction of all instances.
 # -----------------------------------------------------------------------------
+
 s_shapExplainer = function(String model_function, list[unknown] model_args, Matrix[Double] x_instances,
     Matrix[Double] X_bg, Integer n_permutations = 10, Integer n_samples = 100, Integer remove_non_var=0,
     Matrix[Double] partitions=as.matrix(-1), Integer seed = -1, Integer verbose = 0)

diff --git a/scripts/builtin/toOneHot.dml b/scripts/builtin/toOneHot.dml
@@ -21,10 +21,24 @@
 
 # The toOneHot-function encodes unordered categorical vector to multiple binary vectors.
 #
+# .. code-block:: python
+#
+#   >>> import numpy as np
+#   >>> from systemds.context import SystemDSContext
+#   >>> with SystemDSContext() as sds:
+#   ...     X = sds.from_numpy(np.array([[1], [3], [2], [3]]))
+#   ...     Y = toOneHot(X, numClasses=3).compute()
+#   ...     print(Y)
+#   [[1. 0. 0.]
+#    [0. 0. 1.]
+#    [0. 1. 0.]
+#    [0. 0. 1.]]
+#
+#
 # INPUT:
 # ------------------------------------------------------------------------------------------
 # X           Vector with N integer entries between 1 and numClasses
-# numclasses  Number of columns, must be be greater than or equal to largest value in X
+# numClasses  Number of columns, must be greater than or equal to the largest value in X
 # ------------------------------------------------------------------------------------------
 #
 # OUTPUT:

diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
@@ -21,6 +21,17 @@
 
 # This function cleans the top-K items (where K is given as input) for a given list of users.
 # metaData[3, ncol(X)] : metaData[1] stores mask, metaData[2] stores schema, metaData[3] stores FD mask
+#
+# INPUT:
+#-------------------------------------------------------------------------------
+# TODO  TODO
+#-------------------------------------------------------------------------------
+#
+# OUTPUT:
+#-------------------------------------------------------------------------------
+# TODO  TODO
+#-------------------------------------------------------------------------------
+
 
 source("scripts/pipelines/scripts/utils.dml") as utils;
 source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;

diff --git a/src/main/python/docs/README.md b/src/main/python/docs/README.md
@@ -39,4 +39,4 @@ and then run `make html`:
 make html
 ```
 
-The docs will then be created at: `/src/main/python/build`in HTML will be placed in the `./_build` directory.
+The docs will then be created at: `/src/main/python/docs/build/html/`.
diff --git a/src/main/python/docs/requires-docs.txt b/src/main/python/docs/requires-docs.txt
@@ -24,4 +24,5 @@ sphinx_rtd_theme
 numpy
 py4j
 scipy
-requests
+requests
+pandas
diff --git a/src/main/python/docs/source/api/operator/algorithms.rst b/src/main/python/docs/source/api/operator/algorithms.rst
@@ -66,4 +66,9 @@ The output should be similar to
   [ 0.37957689]]
 
 .. automodule:: systemds.operator.algorithm
-  :members:
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   algorithms/*
diff --git a/src/main/python/docs/source/api/operator/algorithms/WoE.rst b/src/main/python/docs/source/api/operator/algorithms/WoE.rst
@@ -0,0 +1,4 @@
+WoE
+====
+
+.. autofunction:: systemds.operator.algorithm.WoE
-Original file line number
+Diff line change
@@ Expand Up / @@ -24,4 +24,5 @@ sphinx_rtd_theme @@
     numpy
     py4j
     scipy
-    requests
+    requests
+    pandas