ArthurLeroy
diff --git a/‎DESCRIPTION
Lines changed: 11 additions & 3 deletions b/‎DESCRIPTION
Lines changed: 11 additions & 3 deletions
diff --git a/‎NAMESPACE
Lines changed: 3 additions & 0 deletions b/‎NAMESPACE
Lines changed: 3 additions & 0 deletions
diff --git a/‎NEWS.md
Lines changed: 26 additions & 10 deletions b/‎NEWS.md
Lines changed: 26 additions & 10 deletions
diff --git a/‎R/data.R
Lines changed: 37 additions & 0 deletions b/‎R/data.R
Lines changed: 37 additions & 0 deletions
diff --git a/‎R/elbos.R
Lines changed: 39 additions & 21 deletions b/‎R/elbos.R
Lines changed: 39 additions & 21 deletions
diff --git a/‎R/em-magma.R
Lines changed: 31 additions & 12 deletions b/‎R/em-magma.R
Lines changed: 31 additions & 12 deletions
@@ -1,12 +1,15 @@
 Package: MagmaClustR
 Title: Clustering and Prediction using Multi-Task Gaussian Processes with
     Common Mean
-Version: 1.0.1
+Version: 1.1.0
 Authors@R: c(
-    person("Arthur", "Leroy", , "arthur.leroy.pro@gmail.com", role = c("aut", "cre"),
+    person("Arthur", "Leroy", , "arthur.leroy.pro@gmail.com",
+           role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-0806-8934")),
+    person("Pierre", "Latouche", , "pierre.latouche@gmail.com", role = "aut"),
     person("Pierre", "Pathé", , "pathepierre@gmail.com", role = "ctb"),
-    person("Pierre", "Latouche", , "pierre.latouche@gmail.com", role = "aut")
+    person("Alexia", "Grenouilla", , "grenouil@insa-toulouse.fr", role = "ctb"),
+    person("Hugo", "Lelievre", , "lelievre@insa-toulouse.fr", role = "ctb")
   )
 Description: An implementation for the multi-task Gaussian processes with common 
     mean framework. Two main algorithms, called 'Magma' and 'MagmaClust', 
@@ -36,6 +39,8 @@ Imports:
     magrittr,
     methods,
     mvtnorm,
+    plyr,
+    purrr,
     Rcpp,
     rlang,
     stats,
@@ -45,6 +50,7 @@ Imports:
 Suggests: 
     gganimate,
     gifski,
+    gridExtra,
     knitr,
     plotly,
     png,
@@ -57,3 +63,5 @@ Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.2.0
+Depends: 
+    R (>= 2.10)
@@ -2,6 +2,7 @@
 
 export("%>%")
 export(data_allocate_cluster)
+export(expand_grid_inputs)
 export(hp)
 export(hyperposterior)
 export(hyperposterior_clust)
@@ -17,6 +18,8 @@ export(pred_gp)
 export(pred_magma)
 export(pred_magmaclust)
 export(proba_max_cluster)
+export(regularise_data)
+export(regularize_data)
 export(sample_gp)
 export(select_nb_cluster)
 export(simu_db)
 
@@ -1,20 +1,36 @@
 # MagmaClustR (development version)
 
+# MagmaClustR 1.1.0
+
+## Major
+* Provide 4 vignettes explaining in detail how the different features of MagmaClustR work on practical examples.
+* Implement expand_grid_inputs() to help create customised n-dimensional input
+grids on which to evaluate the GP.
+* Implement regularize_data() to project a dataset on a specific input grid
+(possibly to control the size of the resulting covariance matrices and the associated running time).
+* Add an internal 'Reference' column to datasets, to provide an adequate identifier for multidimensional inputs.
+* Implement a new version of simu_db() to generate more realistic 2-D datasets.
+
+## Minor
+* Round inputs to 6 significant digits to avoid numerical errors.
+* Generalise the creation of a grid in any dimension when 'grid_inputs' is not
+specified in the prediction functions.
 
 # MagmaClustR 1.0.1
 
 ## Major
-* Remove the package 'optimr' dependency and switch to base 'optim()' function
-* Increase convergence tolerance in 'optim()', which was too slow
+* Remove the package 'optimr' dependency and switch to base 'optim()' function
+* Increase convergence tolerance in 'optim()', which was too slow
 
 ## Minor
-* Fix the warnings about the absolute value function in the Cpp code
-* Remove error message in 'train_magmaclust()' when common_hp_k = FALSE
-* Change the default intervals for hyper-parameters in 'simu_db()'
-* Automatically remove rows with missing data
-* Change position of the 'grid_inputs' argument in prediction functions
-* Remove the internal functions from the index documentation 
-* Fix 'ID' in hyperposterior() and hyperposterior_clust() when not character
+* Fix the warnings about the absolute value function in the Cpp code
+* Remove error message in 'train_magmaclust()' when common_hp_k = FALSE
+* Change the default intervals for hyper-parameters in 'simu_db()'
+* Automatically remove rows with missing data
+* Change position of the 'grid_inputs' argument in prediction functions
+* Remove the internal functions from the index documentation
+* Fix 'ID' in hyperposterior() and hyperposterior_clust() when not character
+
 
 # MagmaClustR 1.0.0
-* Initial release
+* Initial release
@@ -0,0 +1,37 @@
+#' French swimmers performances data on 100m freestyle events
+#'
+#' A subset of data from reported performances of French swimmers during
+#' 100m freestyle competitions between 2002 and 2016. See
+#' https://link.springer.com/article/10.1007/s10994-022-06172-1 and
+#' https://www.mdpi.com/2076-3417/8/10/1766 for dedicated description and
+#' analysis.
+#'
+#' @format ## `swimmers`
+#' A data frame with 76,832 rows and 4 columns:
+#' \describe{
+#'   \item{ID}{Identifying number associated with each swimmer}
+#'   \item{Input}{Age in years}
+#'   \item{Output}{Performance in seconds on a 100m freestyle event}
+#'   \item{Gender}{Competition gender}
+#' }
+#' @source <https://ffn.extranat.fr/webffn/competitions.php?idact=nat>
+"swimmers"
+
+#' Weight follow-up data of children in Singapore
+#'
+#' A subset of data from the GUSTO project (https://www.gusto.sg/) collecting
+#' the weight over time of several children in Singapore.
+#' See https://arxiv.org/abs/2011.07866 for dedicated description and
+#' analysis.
+#'
+#' @format ## `weight`
+#' A data frame with 3,629 rows and 4 columns:
+#' \describe{
+#'   \item{ID}{Identifying number associated with each child}
+#'   \item{sex}{Biological gender}
+#'   \item{Input}{Age in months}
+#'   \item{Output}{Weight in kilograms}
+#' }
+#' @source <https://gustodatavault.sg/>
+"weight"
+
@@ -15,13 +15,24 @@
 #'
 #' @examples
 #' TRUE
-elbo_clust_multi_GP <- function(hp, db, hyperpost, kern, pen_diag) {
+elbo_clust_multi_GP <- function(hp,
+                                db,
+                                hyperpost,
+                                kern,
+                                pen_diag) {
+
   names_k <- hyperpost$mean %>% names()
-  t_i <- db$Input
+  t_i <- db$Reference
   y_i <- db$Output
   i <- unique(db$ID)
 
-  inv <- kern_to_inv(t_i, kern, hp, pen_diag)
+  if("ID" %in% names(db)){
+    inputs <- db %>% dplyr::select(-.data$Output, -.data$ID)
+  } else{
+    inputs <- db %>% dplyr::select(-.data$Output)
+  }
+
+  inv <- kern_to_inv(inputs, kern, hp, pen_diag)
 
   ## classic Gaussian centred log likelihood
   LL_norm <- -dmnorm(y_i, rep(0, length(y_i)), inv, log = T)
@@ -35,12 +46,12 @@ elbo_clust_multi_GP <- function(hp, db, hyperpost, kern, pen_diag) {
       dplyr::filter(.data$ID == i) %>%
       dplyr::pull(k)
     mean_mu_k <- hyperpost$mean[[k]] %>%
-      dplyr::filter(.data$Input %in% t_i) %>%
+      dplyr::filter(.data$Reference %in% t_i) %>%
       dplyr::pull(.data$Output)
     corr1 <- corr1 + tau_i_k * mean_mu_k
     corr2 <- corr2 + tau_i_k *
       (mean_mu_k %*% t(mean_mu_k) +
-        hyperpost$cov[[k]][as.character(t_i), as.character(t_i)])
+         hyperpost$cov[[k]][as.character(t_i), as.character(t_i)])
   }
 
   (LL_norm - y_i %*% inv %*% corr1 + 0.5 * sum(inv * corr2)) %>% return()
@@ -67,21 +78,22 @@ elbo_clust_multi_GP <- function(hp, db, hyperpost, kern, pen_diag) {
 #'
 #' @examples
 #' TRUE
-elbo_GP_mod_common_hp_k <- function(
-  hp,
-  db,
-  mean,
-  kern,
-  post_cov,
-  pen_diag
-  ) {
+elbo_GP_mod_common_hp_k <- function( hp,
+                                     db,
+                                     mean,
+                                     kern,
+                                     post_cov,
+                                     pen_diag) {
 
   list_ID_k <- names(db)
-  # t_k = db[[1]] %>% dplyr::pull(.data$Input)
-  t_k <- db[[1]] %>%
-    dplyr::pull(.data$Input)
 
-  inv <- kern_to_inv(t_k, kern, hp, pen_diag)
+  if("ID" %in% names(db)){
+    inputs <- db[[1]] %>% dplyr::select(-.data$Output, -.data$ID)
+  } else{
+    inputs <- db[[1]] %>% dplyr::select(-.data$Output)
+  }
+
+  inv <- kern_to_inv(inputs, kern, hp, pen_diag)
 
   LL_norm <- 0
   cor_term <- 0
@@ -114,7 +126,12 @@ elbo_GP_mod_common_hp_k <- function(
 #'
 #' @examples
 #' TRUE
-elbo_clust_multi_GP_common_hp_i <- function(hp, db, hyperpost, kern, pen_diag) {
+elbo_clust_multi_GP_common_hp_i <- function(hp,
+                                            db,
+                                            hyperpost,
+                                            kern,
+                                            pen_diag) {
+
   names_k <- hyperpost$mean %>% names()
 
   sum_i <- 0
@@ -123,7 +140,7 @@ elbo_clust_multi_GP_common_hp_i <- function(hp, db, hyperpost, kern, pen_diag) {
     ## Extract the i-th specific reference Input
     input_i <- db %>%
       dplyr::filter(.data$ID == i) %>%
-      dplyr::pull(.data$Input)
+      dplyr::pull(.data$Reference)
     ## Extract the i-th specific inputs (reference + covariates)
     inputs_i <- db %>%
       dplyr::filter(.data$ID == i) %>%
@@ -148,7 +165,7 @@ elbo_clust_multi_GP_common_hp_i <- function(hp, db, hyperpost, kern, pen_diag) {
         dplyr::filter(.data$ID == i) %>%
         dplyr::pull(k)
       mean_mu_k <- hyperpost$mean[[k]] %>%
-        dplyr::filter(.data$Input %in% input_i) %>%
+        dplyr::filter(.data$Reference %in% input_i) %>%
         dplyr::pull(.data$Output)
       corr1 <- corr1 + tau_i_k * mean_mu_k
       corr2 <- corr2 + tau_i_k *
@@ -216,7 +233,7 @@ elbo_monitoring_VEM <- function(hp_k,
   floop2 <- function(i) {
     t_i <- db %>%
       dplyr::filter(.data$ID == i) %>%
-      dplyr::pull(.data$Input)
+      dplyr::pull(.data$Reference)
 
     elbo_clust_multi_GP(
       hp_i[hp_i$ID == i, ],
@@ -257,6 +274,7 @@ elbo_monitoring_VEM <- function(hp_k,
 
     return(sum_tau + det)
   }
+
   sum_corr_k <- sapply(names(m_k), floop3) %>% sum()
 
   return(-sum_ll_k - sum_ll_i + sum_corr_k)
 
@@ -29,10 +29,14 @@ e_step <- function(db,
                    hp_0,
                    hp_i,
                    pen_diag) {
-  all_input <- unique(db$Input) %>% sort()
+  ## Extract the union of all reference inputs provided in the training data
+  all_inputs <- db %>%
+    dplyr::select(-.data$ID, -.data$Output) %>%
+    unique() %>%
+    dplyr::arrange(.data$Reference)
 
   ## Compute all the inverse covariance matrices
-  inv_0 <- kern_to_inv(all_input, kern_0, hp_0, pen_diag)
+  inv_0 <- kern_to_inv(all_inputs, kern_0, hp_0, pen_diag)
   list_inv_i <- list_kern_to_inv(db, kern_i, hp_i, pen_diag)
   ## Create a named list of Output values for all individuals
   list_output_i <- base::split(db$Output, list(db$ID))
@@ -50,8 +54,12 @@ e_step <- function(db,
 
   post_cov <- post_inv %>%
     chol_inv_jitter(pen_diag = pen_diag) %>%
-    `rownames<-`(all_input) %>%
-    `colnames<-`(all_input)
+    `rownames<-`(all_inputs %>%
+                   dplyr::pull(.data$Reference)
+    ) %>%
+    `colnames<-`(all_inputs %>%
+                   dplyr::pull(.data$Reference)
+    )
   ##############################################
 
   ## Update the posterior mean ##
@@ -71,9 +79,8 @@ e_step <- function(db,
   ##############################################
 
   ## Format the mean parameter of the hyper-posterior distribution
-  tib_mean <- tibble::tibble(
-    "Input" = all_input,
-    "Output" = post_mean
+  tib_mean <- tibble::tibble(all_inputs,
+                             "Output" = post_mean
   )
   list(
     "mean" = tib_mean,
@@ -117,8 +124,17 @@ e_step <- function(db,
 #'
 #' @examples
 #' TRUE
-m_step <- function(db, m_0, kern_0, kern_i, old_hp_0, old_hp_i,
-                   post_mean, post_cov, common_hp, pen_diag) {
+m_step <- function(db,
+                   m_0,
+                   kern_0,
+                   kern_i,
+                   old_hp_0,
+                   old_hp_i,
+                   post_mean,
+                   post_cov,
+                   common_hp,
+                   pen_diag) {
+
   list_ID <- unique(db$ID)
   list_hp_0 <- old_hp_0 %>% names()
   list_hp_i <- old_hp_i %>%
@@ -182,10 +198,10 @@ m_step <- function(db, m_0, kern_0, kern_i, old_hp_0, old_hp_i,
       ## Extract the i-th specific inputs
       input_i <- db %>%
         dplyr::filter(.data$ID == i) %>%
-        dplyr::pull(.data$Input)
+        dplyr::pull(.data$Reference)
       ## Extract the mean values associated with the i-th specific inputs
       post_mean_i <- post_mean %>%
-        dplyr::filter(.data$Input %in% input_i) %>%
+        dplyr::filter(.data$Reference %in% input_i) %>%
         dplyr::pull(.data$Output)
       ## Extract the covariance values associated with the i-th specific inputs
       post_cov_i <- post_cov[as.character(input_i), as.character(input_i)]
@@ -214,7 +230,10 @@ m_step <- function(db, m_0, kern_0, kern_i, old_hp_0, old_hp_i,
         tibble::as_tibble_row() %>%
         return()
     }
-    new_hp_i <- sapply(list_ID, floop, simplify = FALSE, USE.NAMES = TRUE) %>%
+    new_hp_i <- sapply(list_ID,
+                       floop,
+                       simplify = FALSE,
+                       USE.NAMES = TRUE) %>%
       tibble::enframe(name = "ID") %>%
       tidyr::unnest(cols = .data$value)
   }