
Commit b70b1e3

Changes for experimenting with simulated data sets.
1 parent 4492667 commit b70b1e3

File tree

3 files changed: +110 −2 lines changed


scripts/discretize.py

Lines changed: 12 additions & 1 deletion
@@ -101,9 +101,20 @@ def write_dataset(dataset, name, sep, colobs, varnames, indices):
     '''
     Write the dataset as a CSV file.
     '''
+    header = False
+    index = False
     if colobs:
         dataset = dataset.T
-    dataset.to_csv(name, sep=sep, header=varnames, index=indices)
+        if indices:
+            header = True
+        if varnames:
+            index = True
+    else:
+        if varnames:
+            header = True
+        if indices:
+            index = True
+    dataset.to_csv(name, sep=sep, header=header, index=index)
 
 
 def main():
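
The point of the change above is that pandas' to_csv flags trade places once the frame is transposed: with observations in columns, the variable names end up on the row index and the observation ids on the header. A minimal sketch of that behaviour, using a toy frame and file names that are illustrative only, not from the repository:

import pandas as pd

# Toy frame: two observations (rows) of two variables (columns).
dataset = pd.DataFrame({'V1': [0, 1], 'V2': [1, 0]})

varnames, indices = True, False   # keep variable names, drop observation ids

# Observations in rows: the variable names form the CSV header.
dataset.to_csv('rows.csv', sep=' ', header=varnames, index=indices)

# Observations in columns: after transposing, the variable names become the
# row index and the observation ids the header, so the two flags swap roles.
dataset.T.to_csv('cols.csv', sep=' ', header=indices, index=varnames)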

scripts/run_experiments.py

Lines changed: 9 additions & 1 deletion
@@ -47,7 +47,15 @@
     ('complete' , ('data/athaliana/athaliana_complete_discretized.tsv', 18380, 16838, ' ', True, True, True)),
     ])
 
-all_datasets = OrderedDict(list(small_datasets.items()) + list(big_datasets.items()))
+simulated_datasets = OrderedDict([
+    #(name, (-f, -n, -m, -s, -c, -v, -i)),
+    ('s1' , ('data/simulated/n30000_p0.00005_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
+    ('s2' , ('data/simulated/n30000_p0.0001_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
+    ('s3' , ('data/simulated/n30000_p0.0005_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
+    ('s4' , ('data/simulated/n30000_p0.001_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
+    ])
+
+all_datasets = OrderedDict(list(small_datasets.items()) + list(big_datasets.items()) + list(simulated_datasets.items()))
 
 all_algorithms = [
     'gs',
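
For reference, the tuple layout follows the #(name, (-f, -n, -m, -s, -c, -v, -i)) comment: file path, number of variables, number of observations, separator, then the colobs/varnames/indices flags used by the other scripts. A small sketch of how an entry could be unpacked; the loop and variable names below are illustrative, not the repository's runner code:

from collections import OrderedDict

simulated_datasets = OrderedDict([
    ('s1', ('data/simulated/n30000_p0.00005_m10000_discretized.tsv',
            30000, 10000, ' ', True, True, False)),
])

for name, (path, nvars, nobs, sep, colobs, varnames, indices) in simulated_datasets.items():
    # -f, -n, -m, -s, -c, -v, -i, in the order given by the comment above.
    print(name, path, nvars, nobs, repr(sep), colobs, varnames, indices)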

scripts/simulate_bn.R

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+#!/usr/bin/env Rscript
+
+##
+# @file simulate_bn.R
+# @brief Script for simulating a Bayesian network using pcalg
+# @author Ankit Srivastava <asrivast@gatech.edu>
+#
+# Copyright 2020 Georgia Institute of Technology
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+library('pcalg')
+library('optparse')
+
+
+if (!exists('argv')) {
+  argv = commandArgs(trailing=TRUE)
+}
+
+parser <- OptionParser()
+parser <- add_option(parser, c('--seed'), type='integer', help='PRNG seed.')
+parser <- add_option(parser, c('--nvars', '-n'), type='integer', help='Number of variables in the simulated network.')
+parser <- add_option(parser, c('--prob', '-p'), type='double', default=0.05, help='Threshold p-value used for generating the graph.')
+parser <- add_option(parser, c('--mbsize'), action='store_true', default=FALSE, help='Print out the average MB size.')
+parser <- add_option(parser, c('--bn', '-b'), type='character', help='Name of the dot file to which the network is written.')
+parser <- add_option(parser, c('--nobs', '-m'), type='integer', help='Number of observations in the simulated dataset.')
+parser <- add_option(parser, c('--datafile', '-d'), type='character', help='Name of the file to which dataset is written.')
+parser <- add_option(parser, c('--colobs', '-c'), action='store_true', default=FALSE, help='The file contains observations in columns.')
+parser <- add_option(parser, c('--sep', '-s'), type='character', default=' ', help='Delimiting character in the dataset file.')
+args <- parse_args(parser, args=argv)
+
+
+if (!is.null(args$seed)) {
+  set.seed(args$seed)
+}
+
+tGenerate <- proc.time()
+nodes <- c()
+for (v in seq(1, args$nvars)) {
+  nodes <- c(nodes, paste('V', v, sep=''))
+}
+# Graph of type graphNEL
+dag <- randomDAG(args$nvars, prob=args$prob, V=nodes)
+show(dag)
+tGenerate <- proc.time() - tGenerate
+cat('Time taken in generating the network:', tGenerate['elapsed'], 'sec\n')
+
+if (args$mbsize) {
+  cat('Using bnlearn from', find.package('bnlearn'), '\n')
+  library('bnlearn')
+  tMBSize <- proc.time()
+  # Convert to BN
+  bn <- as.bn(dag)
+  avgmb <- mean(sapply(nodes, function(n) { length(bn$nodes[[n]]$mb) }))
+  cat('Average MB size is', avgmb, '\n')
+  tMBSize <- proc.time() - tMBSize
+  cat('Time taken in getting the MB sizes:', tMBSize['elapsed'], 'sec\n')
+}
+
+if (!is.null(args$bn)) {
+  tWrite <- proc.time()
+  write.dot(args$bn, bn)
+  tWrite <- proc.time() - tWrite
+  cat('Time taken in writing the network:', tWrite['elapsed'], 'sec\n')
+}
+
+if (!is.null(args$nobs)) {
+  tData <- proc.time()
+  data <- rmvDAG(args$nobs, dag, use.node.names=TRUE)
+  if (!args$colobs) {
+    data <- t(data)
+  }
+  tData <- proc.time() - tData
+  cat('Time taken in getting the data:', tData['elapsed'], 'sec\n')
+  tWrite <- proc.time()
+  write.table(data, file=args$datafile, sep=args$sep, row.names=!args$colobs, col.names=args$colobs)
+  tWrite <- proc.time() - tWrite
+  cat('Time taken in writing the dataset:', tWrite['elapsed'], 'sec\n')
+}
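
The four --prob values used for the simulated datasets in run_experiments.py span quite different network densities. Assuming pcalg's randomDAG includes each of the n(n-1)/2 candidate edges independently with probability prob (an assumption about the generator; the --mbsize option reports the resulting average Markov-blanket size empirically), a back-of-the-envelope estimate for n = 30000 is:

# Rough expected density, assuming each candidate edge is drawn independently
# with probability p (an assumption about pcalg::randomDAG, not measured output).
n = 30000
for p in (0.00005, 0.0001, 0.0005, 0.001):
    expected_edges = p * n * (n - 1) / 2
    expected_neighbours = p * (n - 1)   # parents + children; a lower bound on the MB size
    print(f'p={p}: ~{expected_edges:,.0f} edges, ~{expected_neighbours:.1f} neighbours per node')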
