tpch partitions for 100g, 250g and diff max executors

Dhruv Garg · Dhruv Garg · commit e4007266dbce · 2025-02-20T16:09:41.000-05:00
diff --git a/data/tpch_partitioning_analysis.ipynb b/data/tpch_partitioning_analysis.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -73,7 +73,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -84,7 +84,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -103,7 +103,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -117,48 +117,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Input params to create buckets\n",
+    "json_path = \"/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/cloudlab_22query_tpch_profiles.json\"\n",
+    "bucket_size=8000\n",
+    "dataset_size=\"100g\"\n",
+    "max_executors=200\n",
+    "min_task_runtime_ms=12000\n",
+    "\n",
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=bucket_size, dataset_size=dataset_size, max_executors=max_executors, min_task_runtime_ms=min_task_runtime_ms)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 100g dataset, varying the executors: 75, 100, 200"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'easy': [],\n",
-       " 'medium': ['q1',\n",
-       "  'q2',\n",
-       "  'q6',\n",
-       "  'q11',\n",
-       "  'q12',\n",
-       "  'q13',\n",
-       "  'q14',\n",
-       "  'q15',\n",
-       "  'q16',\n",
-       "  'q19',\n",
-       "  'q20',\n",
-       "  'q22'],\n",
-       " 'hard': ['q3', 'q4', 'q5', 'q7', 'q8', 'q9', 'q10', 'q17', 'q18', 'q21']}"
+       "{'easy': ['q11', 'q13', 'q14', 'q15', 'q19', 'q20', 'q22'],\n",
+       " 'medium': ['q1', 'q2', 'q4', 'q6', 'q10', 'q12', 'q16', 'q17', 'q18'],\n",
+       " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q21']}"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# Input params to create buckets\n",
-    "json_path = \"/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/cloudlab_22query_tpch_profiles.json\"\n",
-    "bucket_size=8000\n",
-    "dataset_size=\"100g\"\n",
-    "max_executors=200\n",
-    "min_task_runtime_ms=12000\n",
-    "\n",
-    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=bucket_size, dataset_size=dataset_size, max_executors=max_executors, min_task_runtime_ms=min_task_runtime_ms)\n",
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=5500, dataset_size=\"100g\", max_executors=75, min_task_runtime_ms=12000)\n",
     "buckets"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -169,7 +174,7 @@
        " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q18', 'q21']}"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -181,80 +186,100 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n",
-       " 'medium': ['q1', 'q4', 'q6', 'q10', 'q12', 'q14', 'q15', 'q20'],\n",
+       "{'easy': ['q6', 'q11', 'q13', 'q19', 'q22'],\n",
+       " 'medium': ['q1', 'q2', 'q4', 'q10', 'q12', 'q14', 'q15', 'q16', 'q20'],\n",
        " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q17', 'q18', 'q21']}"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=7500, dataset_size=\"100g\", max_executors=100, min_task_runtime_ms=12000)\n",
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=10000, dataset_size=\"100g\", max_executors=200, min_task_runtime_ms=12000)\n",
     "buckets"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 250g dataset, varying the executors: 75, 100, 200"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'easy': ['q13', 'q16', 'q22'],\n",
-       " 'medium': ['q2', 'q6', 'q11', 'q12', 'q14', 'q15', 'q19', 'q20'],\n",
-       " 'hard': ['q1',\n",
-       "  'q3',\n",
-       "  'q4',\n",
-       "  'q5',\n",
-       "  'q7',\n",
-       "  'q8',\n",
-       "  'q9',\n",
-       "  'q10',\n",
-       "  'q17',\n",
-       "  'q18',\n",
-       "  'q21']}"
+       "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n",
+       " 'medium': ['q1', 'q6', 'q7', 'q10', 'q12', 'q14', 'q15', 'q20'],\n",
+       " 'hard': ['q3', 'q4', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=9000, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n",
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=11500, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n",
     "buckets"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n",
-       " 'medium': ['q1', 'q6', 'q7', 'q10', 'q12', 'q14', 'q15', 'q20'],\n",
-       " 'hard': ['q3', 'q4', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}"
+       " 'medium': ['q1', 'q6', 'q10', 'q12', 'q14', 'q15', 'q20'],\n",
+       " 'hard': ['q3', 'q4', 'q5', 'q7', 'q8', 'q9', 'q17', 'q18', 'q21']}"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=11500, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n",
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=15000, dataset_size=\"250g\", max_executors=100, min_task_runtime_ms=12000)\n",
+    "buckets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'easy': ['q1', 'q2', 'q6', 'q11', 'q13', 'q16', 'q22'],\n",
+       " 'medium': ['q4', 'q7', 'q10', 'q12', 'q14', 'q15', 'q19', 'q20'],\n",
+       " 'hard': ['q3', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=15000, dataset_size=\"250g\", max_executors=200, min_task_runtime_ms=12000)\n",
     "buckets"
    ]
   },