apache
diff --git a/‎scripts/builtin/confusionMatrix.dml
Lines changed: 1 addition & 1 deletion b/‎scripts/builtin/confusionMatrix.dml
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/builtin/cooccurrenceMatrix.dml
Lines changed: 3 additions & 2 deletions b/‎scripts/builtin/cooccurrenceMatrix.dml
Lines changed: 3 additions & 2 deletions
diff --git a/‎scripts/builtin/correctTypos.dml
Lines changed: 4 additions & 4 deletions b/‎scripts/builtin/correctTypos.dml
Lines changed: 4 additions & 4 deletions
diff --git a/‎scripts/builtin/decisionTree.dml
Lines changed: 2 additions & 2 deletions b/‎scripts/builtin/decisionTree.dml
Lines changed: 2 additions & 2 deletions
diff --git a/‎scripts/builtin/dist.dml
Lines changed: 16 additions & 0 deletions b/‎scripts/builtin/dist.dml
Lines changed: 16 additions & 0 deletions
diff --git a/‎scripts/builtin/imputeByKNN.dml
Lines changed: 8 additions & 15 deletions b/‎scripts/builtin/imputeByKNN.dml
Lines changed: 8 additions & 15 deletions
diff --git a/‎scripts/builtin/quantizeByCluster.dml
Lines changed: 1 addition & 1 deletion b/‎scripts/builtin/quantizeByCluster.dml
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/builtin/randomForest.dml
Lines changed: 4 additions & 3 deletions b/‎scripts/builtin/randomForest.dml
Lines changed: 4 additions & 3 deletions
diff --git a/‎scripts/builtin/shapExplainer.dml
Lines changed: 1 addition & 0 deletions b/‎scripts/builtin/shapExplainer.dml
Lines changed: 1 addition & 0 deletions
diff --git a/‎scripts/builtin/topk_cleaning.dml
Lines changed: 11 additions & 0 deletions b/‎scripts/builtin/topk_cleaning.dml
Lines changed: 11 additions & 0 deletions
@@ -23,7 +23,7 @@
 # and actual labels. We return both the counts and relative frequency
 # (normalized by sum of true labels)
 #
-# .. code-block::
+# .. code-block:: text
 #
 #                   True Labels
 #                     1    2
 
@@ -19,8 +19,8 @@
 #
 #-------------------------------------------------------------
 
-## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
-## Adds an index column to the result. The implementation is based on
+# Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
+# Adds an index column to the result. The implementation is based on
 # https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
 #
 # INPUT:
@@ -32,6 +32,7 @@
 # ------------------------------------------------------------------------------
 # result    (Frame[Unknown]): Processed text data with an index column.
 # ------------------------------------------------------------------------------
+
 processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
     print("processText");
     tmpStr = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
 
@@ -34,10 +34,10 @@
 #
 # INPUT:
 # ----------------------------------------------------------------------------------------
-# strings              The nx1 input frame of corrupted strings
-# frequency_threshold  Strings that occur above this frequency level will not be corrected
-# distance_threshold   Max distance at which strings are considered similar
-# is_verbose           Print debug information
+# strings               The nx1 input frame of corrupted strings
+# frequency_threshold   Strings that occur above this frequency level will not be corrected
+# distance_threshold    Max distance at which strings are considered similar
+# is_verbose            Print debug information
 # ----------------------------------------------------------------------------------------
 #
 # OUTPUT:
 
@@ -30,9 +30,9 @@
 #   and the following trees, M would look as follows:
 #
 #   (L1)               |d<5|
-#                     /     \
+#                     /     \\
 #   (L2)           P1:2    |a<7|
-#                          /   \
+#                          /   \\
 #   (L3)                 P2:2 P3:1
 #
 #   --> M :=
 
@@ -31,6 +31,22 @@
 # Y      Euclidean distance matrix
 # -----------------------------------------------------------------------------------------------
 
+
+# .. code-block:: python
+#
+#   import numpy as np
+#   from systemds.context import SystemDSContext
+#   from systemds.operator.algorithm import dist
+#   from numpy.testing import assert_allclose
+#   with SystemDSContext() as sds:
+#       X = sds.from_numpy(np.array([[0], [3], [4]]))
+#       out = dist(X).compute()
+#       expected = np.array([
+#           [0, 3, 4],
+#           [3, 0, 1],
+#           [4, 1, 0]
+#       ])
+#       assert_allclose(out, expected)
+
+
 m_dist = function(Matrix[Double] X) return (Matrix[Double] Y) {
   n = nrow(X)
   s = rowSums(X^2)
 
@@ -25,23 +25,16 @@
 # the missing values by column means. Currently, only the column with the most
 # missing values is actually imputed.
 #
-# ------------------------------------------------------------------------------
 # INPUT:
 # ------------------------------------------------------------------------------
-# X           Matrix with missing values, which are represented as NaNs
-# method      Method used for imputing missing values with different performance
-#             and accuracy tradeoffs:
-#             'dist' (default): Compute all-pairs distances and impute the
-#                               missing values by closest. O(N^2 * #features)
-#             'dist_missing':   Compute distances between data and records with
-#                               missing values. O(N*M * #features), assuming
-#                               that the number of records with MV is M<<N.
-#             'dist_sample':    Compute distances between sample of data and
-#                               records with missing values. O(S*M * #features)
-#                               with M<<N and S<<N, but suboptimal imputation.
-# seed        Root seed value for random/sample calls for deterministic behavior
-#             -1 for true randomization
-# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
+# X             Matrix with missing values, which are represented as NaNs
+# method        Method used for imputing missing values with different performance and accuracy tradeoffs:\n
+#               - 'dist' (default): Compute all-pairs distances and impute the missing values by closest. O(N^2 * #features)
+#               - 'dist_missing': Compute distances between data and records with missing values. O(N*M * #features), assuming that the number of records with MV is M<<N.
+#               - 'dist_sample': Compute distances between sample of data and records with missing values. O(S*M * #features) with M<<N and S<<N, but suboptimal imputation.
+#
+# seed          Root seed value for random/sample calls for deterministic behavior. -1 for true randomization
+# sample_frac   Sample fraction for 'dist_sample' (value between 0 and 1)
 # ------------------------------------------------------------------------------
 #
 # OUTPUT:
 
@@ -58,7 +58,7 @@
 #           the product quantization. Only relevant when space_decomp = TRUE.
 # ------------------------------------------------------------------------------------------
 
-m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10,
+m_quantizeByCluster = function(Matrix[Double] X, Integer M = 4, Integer k = 10, Integer runs = 10,
     Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean space_decomp=FALSE, Integer seed = -1)
   return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)
 {
 
@@ -26,16 +26,17 @@
 # and optionally subset of features (columns). During tree construction, split
 # candidates are additionally chosen on a sample of remaining features.
 #
-# .. code-block::
+# .. code-block:: text
 #
 #   For example, given a feature matrix with features [a,b,c,d]
 #   and the following two trees, M (the output) would look as follows:
 #
 #   (L1)          |a<7|                   |d<5|
-#                /     \                 /     \
+#                /     \\                 /     \\
 #   (L2)     |c<3|     |b<4|         |a<7|     P3:2
-#            /   \     /   \         /   \
+#            /   \\     /   \\         /   \\
 #   (L3)   P1:2 P2:1 P3:1 P4:2     P1:2 P2:1
+#
 #   --> M :=
 #   [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
 #    [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
 
@@ -51,6 +51,7 @@
 # S              Matrix holding the shapley values along the cols, one row per instance.
 # expected       Double holding the average prediction of all instances.
 # -----------------------------------------------------------------------------
+
 s_shapExplainer = function(String model_function, list[unknown] model_args, Matrix[Double] x_instances,
     Matrix[Double] X_bg, Integer n_permutations = 10, Integer n_samples = 100, Integer remove_non_var=0,
     Matrix[Double] partitions=as.matrix(-1), Integer seed = -1, Integer verbose = 0)
 
@@ -21,6 +21,17 @@
 
 # This function cleans the top-K items (where K is given as input) for a given list of users.
 # metaData[3, ncol(X)] : metaData[1] stores mask, metaData[2] stores schema, metaData[3] stores FD mask
+#
+# INPUT:
+#-------------------------------------------------------------------------------
+# TODO  TODO
+#-------------------------------------------------------------------------------
+#
+# OUTPUT:
+#-------------------------------------------------------------------------------
+# TODO  TODO
+#-------------------------------------------------------------------------------
+
 
 source("scripts/pipelines/scripts/utils.dml") as utils;
 source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@`
`23`	`23`	`# and actual labels. We return both the counts and relative frequency`
`24`	`24`	`# (normalized by sum of true labels)`
`25`	`25`	`#`
`26`		`-# .. code-block::`
	`26`	`+# .. code-block:: text`
`27`	`27`	`#`
`28`	`28`	`# True Labels`
`29`	`29`	`# 1 2`
Original file line number	Diff line number	Diff line change
`@@ -30,9 +30,9 @@`
`30`	`30`	`# and the following trees, M would look as follows:`
`31`	`31`	`#`
`32`	`32`	`# (L1) \|d<5\|`
`33`		`-# / \`
	`33`	`+# / \\`
`34`	`34`	`# (L2) P1:2 \|a<7\|`
`35`		`-# / \`
	`35`	`+# / \\`
`36`	`36`	`# (L3) P2:2 P3:1`
`37`	`37`	`#`
`38`	`38`	`# --> M :=`
Original file line number	Diff line number	Diff line change
`@@ -58,7 +58,7 @@`
`58`	`58`	`# the product quantization. Only relevant when space_decomp = TRUE.`
`59`	`59`	`# ------------------------------------------------------------------------------------------`
`60`	`60`
`61`		`-m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10,`
	`61`	`+m_quantizeByCluster = function(Matrix[Double] X, Integer M = 4, Integer k = 10, Integer runs = 10,`
`62`	`62`	`Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean space_decomp=FALSE, Integer seed = -1)`
`63`	`63`	`return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)`
`64`	`64`	`{`