(dsl): Support Sampler aggregation (#649)

pavlovicisidora · web-flow · commit 151bf73e92c6 · 2025-07-01T15:03:17.000+02:00
diff --git a/modules/integration/src/test/scala/zio/elasticsearch/HttpExecutorSpec.scala b/modules/integration/src/test/scala/zio/elasticsearch/HttpExecutorSpec.scala
@@ -33,7 +33,16 @@ import zio.elasticsearch.query.sort.SortOrder._
 import zio.elasticsearch.query.sort.SourceType.NumberType
 import zio.elasticsearch.query.{Distance, FunctionScoreBoostMode, FunctionScoreFunction, InnerHits}
 import zio.elasticsearch.request.{CreationOutcome, DeletionOutcome}
-import zio.elasticsearch.result.{FilterAggregationResult, Item, MaxAggregationResult, UpdateByQueryResult}
+import zio.elasticsearch.result.{
+  FilterAggregationResult,
+  Item,
+  MaxAggregationResult,
+  SamplerAggregationResult,
+  SumAggregationResult,
+  TermsAggregationBucketResult,
+  TermsAggregationResult,
+  UpdateByQueryResult
+}
 import zio.elasticsearch.script.{Painless, Script}
 import zio.json.ast.Json.{Arr, Str}
 import zio.schema.codec.JsonCodec
@@ -408,6 +417,57 @@ object HttpExecutorSpec extends IntegrationSpec {
             Executor.execute(ElasticRequest.createIndex(firstSearchIndex)),
             Executor.execute(ElasticRequest.deleteIndex(firstSearchIndex)).orDie
           ),
+          test("aggregate using sampler aggregation with sum and terms sub aggregations") {
+            // Upserts three documents, then samples them with a sum and a terms sub-aggregation.
+            checkOnce(genDocumentId, genTestDocument, genDocumentId, genTestDocument, genDocumentId, genTestDocument) {
+              (docIdA, docA, docIdB, docB, docIdC, docC) =>
+                for {
+                  _        <- Executor.execute(ElasticRequest.deleteByQuery(firstSearchIndex, matchAll))
+                  documentA = docA.copy(stringField = "abc", intField = 10)
+                  documentB = docB.copy(stringField = "def", intField = 20)
+                  documentC = docC.copy(stringField = "ghi", intField = 15)
+                  _        <- Executor.execute(ElasticRequest.upsert[TestDocument](firstSearchIndex, docIdA, documentA))
+                  _        <- Executor.execute(ElasticRequest.upsert[TestDocument](firstSearchIndex, docIdB, documentB))
+                  _        <- Executor.execute(
+                         ElasticRequest.upsert[TestDocument](firstSearchIndex, docIdC, documentC).refreshTrue
+                       )
+                  aggregation = samplerAggregation(
+                                  "sampler_agg",
+                                  sumAggregation("total_sum_field", TestDocument.intField)
+                                ).withSubAgg(termsAggregation("string_categories", TestDocument.stringField.keyword))
+                                  .maxDocumentsPerShard(100)
+                  aggsRes <-
+                    Executor
+                      .execute(ElasticRequest.aggregate(selectors = firstSearchIndex, aggregation = aggregation))
+                      .aggregations
+                      .map(_.head)
+
+                  // Expected: all three documents sampled, intField sum 10 + 20 + 15 = 45, one bucket per stringField.
+                  expectedResult =
+                    (
+                      "sampler_agg",
+                      SamplerAggregationResult(
+                        docCount = 3,
+                        subAggregations = Map(
+                          "total_sum_field"   -> SumAggregationResult(value = 45.0),
+                          "string_categories" -> TermsAggregationResult(
+                            docErrorCount = 0,
+                            sumOtherDocCount = 0,
+                            buckets = Chunk(
+                              TermsAggregationBucketResult(key = "abc", docCount = 1, subAggregations = Map.empty),
+                              TermsAggregationBucketResult(key = "def", docCount = 1, subAggregations = Map.empty),
+                              TermsAggregationBucketResult(key = "ghi", docCount = 1, subAggregations = Map.empty)
+                            )
+                          )
+                        )
+                      )
+                    )
+                } yield assert(aggsRes)(equalTo(expectedResult))
+            }
+          } @@ around(
+            Executor.execute(ElasticRequest.createIndex(firstSearchIndex)),
+            Executor.execute(ElasticRequest.deleteIndex(firstSearchIndex)).orDie
+          ),
           test("aggregate using stats aggregation") {
             checkOnce(genDocumentId, genTestDocument, genDocumentId, genTestDocument, genDocumentId, genTestDocument) {
               (firstDocumentId, firstDocument, secondDocumentId, secondDocument, thirdDocumentId, thirdDocument) =>
@@ -795,6 +872,59 @@ object HttpExecutorSpec extends IntegrationSpec {
             Executor.execute(ElasticRequest.createIndex(firstSearchIndex)),
             Executor.execute(ElasticRequest.deleteIndex(firstSearchIndex)).orDie
           ),
+          test("search using sampler aggregation") {
+            // Expected sampler result: two sampled documents and one terms bucket per sampled string value.
+            val expectedAggResult = SamplerAggregationResult(
+              docCount = 2,
+              subAggregations = Map(
+                "sampled_strings" -> TermsAggregationResult(
+                  docErrorCount = 0,
+                  sumOtherDocCount = 0,
+                  buckets = Chunk(
+                    TermsAggregationBucketResult(key = "zio", docCount = 1, subAggregations = Map.empty),
+                    TermsAggregationBucketResult(key = "zio-elasticsearch", docCount = 1, subAggregations = Map.empty)
+                  )
+                )
+              )
+            )
+            checkOnce(genDocumentId, genTestDocument, genDocumentId, genTestDocument, genDocumentId, genTestDocument) {
+              (docIdA, docA, docIdB, docB, docIdC, docC) =>
+                // documentB is indexed too, but only documentA and documentC are expected as search hits.
+                val documentA          = docA.copy(stringField = "zio")
+                val documentB          = docB.copy(stringField = "elasticsearch")
+                val documentC          = docC.copy(stringField = "zio-elasticsearch")
+                val expectedSearchDocs = Chunk(documentA, documentC)
+                for {
+                  _ <- Executor.execute(ElasticRequest.deleteByQuery(firstSearchIndex, matchAll))
+                  _ <- Executor.execute(ElasticRequest.upsert[TestDocument](firstSearchIndex, docIdA, documentA))
+                  _ <- Executor.execute(ElasticRequest.upsert[TestDocument](firstSearchIndex, docIdB, documentB))
+                  _ <- Executor.execute(
+                         ElasticRequest.upsert[TestDocument](firstSearchIndex, docIdC, documentC).refreshTrue
+                       )
+                  searchQuery = matches(TestDocument.stringField, "zio")
+                  aggregation = samplerAggregation(
+                                  "sampler_agg",
+                                  termsAggregation("sampled_strings", TestDocument.stringField.keyword)
+                                )
+                                  .maxDocumentsPerShard(2)
+                  res <- Executor.execute(
+                           ElasticRequest
+                             .search(
+                               selectors = firstSearchIndex,
+                               query = searchQuery,
+                               aggregation = aggregation
+                             )
+                         )
+                  docs       <- res.documentAs[TestDocument]
+                  samplerAgg <- res.aggregation("sampler_agg")
+                } yield assert(docs.length)(equalTo(2)) &&
+                  assert(docs.toSet)(equalTo(expectedSearchDocs.toSet)) &&
+                  assert(samplerAgg)(isSome(equalTo(expectedAggResult)))
+            }
+          } @@ around(
+            Executor.execute(ElasticRequest.createIndex(firstSearchIndex)),
+            Executor.execute(ElasticRequest.deleteIndex(firstSearchIndex)).orDie
+          ),
           test(
             "search using match all query with terms aggregations, nested max aggregation and nested bucketSelector aggregation"
           ) {
diff --git a/modules/library/src/main/scala/zio/elasticsearch/ElasticAggregation.scala b/modules/library/src/main/scala/zio/elasticsearch/ElasticAggregation.scala
@@ -345,6 +345,26 @@ object ElasticAggregation {
   final def percentilesAggregation(name: String, field: String): PercentilesAggregation =
     Percentiles(name = name, field = field, percents = Chunk.empty, missing = None)
 
+  /**
+   * Constructs an instance of [[zio.elasticsearch.aggregation.SamplerAggregation]] using the specified parameters.
+   *
+   * @param name
+   *   the name of the aggregation
+   * @param agg
+   *   the first required sub-aggregation to be included in the sampler
+   * @param aggs
+   *   additional sub-aggregations to be included in the sampler
+   * @return
+   *   an instance of [[zio.elasticsearch.aggregation.SamplerAggregation]] that represents the sampler aggregation to
+   *   be performed. `shard_size` defaults to `100` documents per shard; use `maxDocumentsPerShard` to override it.
+   */
+  final def samplerAggregation(
+    name: String,
+    agg: SingleElasticAggregation,
+    aggs: SingleElasticAggregation*
+  ): SamplerAggregation =
+    Sampler(name = name, shardSizeValue = 100, subAggregations = agg +: aggs)
+
   /**
    * Constructs a type-safe instance of [[zio.elasticsearch.aggregation.StatsAggregation]] using the specified
    * parameters.
diff --git a/modules/library/src/main/scala/zio/elasticsearch/aggregation/Aggregations.scala b/modules/library/src/main/scala/zio/elasticsearch/aggregation/Aggregations.scala
@@ -351,6 +351,44 @@ private[elasticsearch] final case class Percentiles(
   }
 }
 
+sealed trait SamplerAggregation extends SingleElasticAggregation with WithSubAgg[SamplerAggregation] {
+
+  /**
+   * Sets the `shard_size` parameter for the [[zio.elasticsearch.aggregation.SamplerAggregation]]. This parameter
+   * controls the maximum number of documents to be returned per shard.
+   *
+   * @param value
+   *   the maximum number of documents per shard
+   * @return
+   *   an instance of the [[zio.elasticsearch.aggregation.SamplerAggregation]] enriched with the `shard_size` parameter.
+   */
+  def maxDocumentsPerShard(value: Int): SamplerAggregation
+}
+
+// Implementation of [[SamplerAggregation]]: the aggregation name, the `shard_size` value and its sub-aggregations.
+private[elasticsearch] final case class Sampler(
+  name: String,
+  shardSizeValue: Int,
+  subAggregations: Seq[SingleElasticAggregation]
+) extends SamplerAggregation {
+  self =>
+  def maxDocumentsPerShard(value: Int): SamplerAggregation =
+    self.copy(shardSizeValue = value)
+
+  // Prepends the given aggregation to the existing sub-aggregations.
+  def withSubAgg(aggregation: SingleElasticAggregation): SamplerAggregation =
+    self.copy(subAggregations = aggregation +: subAggregations)
+
+  // Places the `sampler` parameters and the merged sub-aggregations (under `aggs`) beneath the aggregation name.
+  private[elasticsearch] def toJson: Json = {
+    val samplerParamsContent: Obj = Obj("sampler" -> Obj("shard_size" -> shardSizeValue.toJson))
+    // `reduce` needs a non-empty sequence; `samplerAggregation` always passes at least one sub-aggregation.
+    val subAggsJson: Obj          = Obj("aggs" -> subAggregations.map(_.toJson).reduce(_ merge _))
+
+    Obj(name -> (samplerParamsContent merge subAggsJson))
+  }
+}
+
 sealed trait StatsAggregation extends SingleElasticAggregation with HasMissing[StatsAggregation] with WithAgg
 
 private[elasticsearch] final case class Stats(name: String, field: String, missing: Option[Double])
diff --git a/modules/library/src/main/scala/zio/elasticsearch/executor/response/AggregationResponse.scala b/modules/library/src/main/scala/zio/elasticsearch/executor/response/AggregationResponse.scala
@@ -73,9 +73,8 @@ object AggregationResponse {
       case FilterAggregationResponse(docCount, subAggregations) =>
         FilterAggregationResult(
           docCount = docCount,
-          subAggregations = subAggregations.fold(Map[String, AggregationResult]())(_.map { case (key, response) =>
-            (key, toResult(response))
-          })
+          subAggregations =
+            subAggregations.map(_.map { case (key, response) => (key, toResult(response)) }).getOrElse(Map.empty)
         )
       case MaxAggregationResponse(value) =>
         MaxAggregationResult(value)
@@ -87,6 +86,11 @@ object AggregationResponse {
         PercentileRanksAggregationResult(values)
       case PercentilesAggregationResponse(values) =>
         PercentilesAggregationResult(values)
+      case SamplerAggregationResponse(count, aggs) =>
+        SamplerAggregationResult(
+          docCount = count,
+          subAggregations = aggs.map(_.map { case (key, response) => (key, toResult(response)) }).getOrElse(Map.empty)
+        )
       case StatsAggregationResponse(count, min, max, avg, sum) =>
         StatsAggregationResult(count, min, max, avg, sum)
       case SumAggregationResponse(value) =>
@@ -99,9 +103,8 @@ object AggregationResponse {
             TermsAggregationBucketResult(
               docCount = b.docCount,
               key = b.key,
-              subAggregations = b.subAggregations.fold(Map[String, AggregationResult]())(_.map { case (key, response) =>
-                (key, toResult(response))
-              })
+              subAggregations =
+                b.subAggregations.map(_.map { case (key, response) => (key, toResult(response)) }).getOrElse(Map.empty)
             )
           )
         )
@@ -169,6 +172,8 @@ private[elasticsearch] case class BucketDecoder(fields: Chunk[(String, Json)]) e
             )
           case str if str.contains("percentiles#") =>
             Some(field -> PercentilesAggregationResponse(values = objFields("values").unsafeAs[Map[String, Double]]))
+          case str if str.contains("sampler#") =>
+            Some(field -> data.unsafeAs[SamplerAggregationResponse](SamplerAggregationResponse.decoder))
           case str if str.contains("stats#") =>
             Some(
               field -> StatsAggregationResponse(
@@ -212,6 +217,8 @@ private[elasticsearch] case class BucketDecoder(fields: Chunk[(String, Json)]) e
           (field.split("#")(1), data.asInstanceOf[PercentileRanksAggregationResponse])
         case str if str.contains("percentiles#") =>
           (field.split("#")(1), data.asInstanceOf[PercentilesAggregationResponse])
+        case str if str.contains("sampler#") =>
+          (field.split("#")(1), data.asInstanceOf[SamplerAggregationResponse])
         case str if str.contains("stats#") =>
           (field.split("#")(1), data.asInstanceOf[StatsAggregationResponse])
         case str if str.contains("sum#") =>
@@ -320,6 +327,27 @@ private[elasticsearch] object PercentilesAggregationResponse {
     DeriveJsonDecoder.gen[PercentilesAggregationResponse]
 }
 
+// Response model for a sampler aggregation: the `doc_count` and, when present, the decoded sub-aggregations.
+private[elasticsearch] final case class SamplerAggregationResponse(
+  @jsonField("doc_count")
+  docCount: Int,
+  subAggregations: Option[Map[String, AggregationResponse]] = None
+) extends AggregationResponse
+
+private[elasticsearch] object SamplerAggregationResponse {
+  implicit val decoder: JsonDecoder[SamplerAggregationResponse] = Obj.decoder.mapOrFail { case Obj(fields) =>
+    val bucketDecoder = BucketDecoder(fields)
+    val allFields     = bucketDecoder.allFields
+    // NOTE(review): a missing "doc_count" key or a non-`Int` value would throw here rather than yield a `Left`;
+    // presumably `BucketDecoder` guarantees both — confirm.
+    val docCount      = allFields("doc_count").asInstanceOf[Int]
+    val subAggs       = bucketDecoder.subAggs
+
+    // An empty sub-aggregation map is represented as `None`.
+    Right(SamplerAggregationResponse.apply(docCount, Option(subAggs).filter(_.nonEmpty)))
+  }
+}
+
 private[elasticsearch] final case class StatsAggregationResponse(
   count: Int,
   min: Double,
diff --git a/modules/library/src/main/scala/zio/elasticsearch/executor/response/SearchWithAggregationsResponse.scala b/modules/library/src/main/scala/zio/elasticsearch/executor/response/SearchWithAggregationsResponse.scala
@@ -92,6 +92,8 @@ private[elasticsearch] final case class SearchWithAggregationsResponse(
                       PercentileRanksAggregationResponse.decoder.decodeJson(data.toString).map(field.split("#")(1) -> _)
                     case str if str.contains("percentiles#") =>
                       PercentilesAggregationResponse.decoder.decodeJson(data.toString).map(field.split("#")(1) -> _)
+                    case str if str.contains("sampler#") =>
+                      SamplerAggregationResponse.decoder.decodeJson(data.toString).map(field.split("#")(1) -> _)
                     case str if str.contains("stats#") =>
                       StatsAggregationResponse.decoder.decodeJson(data.toString).map(field.split("#")(1) -> _)
                     case str if str.contains("sum#") =>
diff --git a/modules/library/src/main/scala/zio/elasticsearch/package.scala b/modules/library/src/main/scala/zio/elasticsearch/package.scala
@@ -156,6 +156,18 @@ package object elasticsearch extends IndexNameNewtype with IndexPatternNewtype w
     def asPercentilesAggregation(name: String): RIO[R, Option[PercentilesAggregationResult]] =
       aggregationAs[PercentilesAggregationResult](name)
 
+    /**
+     * Executes the [[ElasticRequest.SearchRequest]] or the [[ElasticRequest.SearchAndAggregateRequest]].
+     *
+     * @param name
+     *   the name of the aggregation to retrieve
+     * @return
+     *   a [[RIO]] effect that, when executed, will produce the aggregation as an instance of
+     *   [[result.SamplerAggregationResult]].
+     */
+    def asSamplerAggregation(name: String): RIO[R, Option[SamplerAggregationResult]] =
+      aggregationAs[SamplerAggregationResult](name)
+
     /**
      * Executes the [[ElasticRequest.SearchRequest]] or the [[ElasticRequest.SearchAndAggregateRequest]].
      *
diff --git a/modules/library/src/main/scala/zio/elasticsearch/result/AggregationResult.scala b/modules/library/src/main/scala/zio/elasticsearch/result/AggregationResult.scala
@@ -65,6 +65,35 @@ final case class PercentileRanksAggregationResult private[elasticsearch] (values
 final case class PercentilesAggregationResult private[elasticsearch] (values: Map[String, Double])
     extends AggregationResult
 
+final case class SamplerAggregationResult private[elasticsearch] (
+  docCount: Int,
+  subAggregations: Map[String, AggregationResult]
+) extends AggregationResult {
+
+  /**
+   * Looks up a sub-aggregation result by name and narrows it to the requested result type.
+   *
+   * The `ClassTag` makes the `agg: A` pattern a real runtime check; without it `A` is erased, every sub-aggregation
+   * matches and the `Left` branch can never be reached.
+   *
+   * @param aggName
+   *   the name of the sub-aggregation to retrieve
+   * @tparam A
+   *   the expected type of the sub-aggregation result
+   * @return
+   *   `Right(Some(result))` if the sub-aggregation exists and is an `A`, `Right(None)` if there is no sub-aggregation
+   *   with that name, or a `Left` holding a [[DecodingException]] if it exists but has a different type.
+   */
+  def subAggregationAs[A <: AggregationResult: scala.reflect.ClassTag](
+    aggName: String
+  ): Either[DecodingException, Option[A]] =
+    subAggregations.get(aggName) match {
+      case Some(agg: A) => Right(Some(agg))
+      case Some(_)      => Left(DecodingException(s"Aggregation with name $aggName was not of type you provided."))
+      case None         => Right(None)
+    }
+}
+
 final case class StatsAggregationResult private[elasticsearch] (
   count: Int,
   min: Double,
diff --git a/modules/library/src/test/scala/zio/elasticsearch/ElasticAggregationSpec.scala b/modules/library/src/test/scala/zio/elasticsearch/ElasticAggregationSpec.scala