@@ -24,7 +24,7 @@ import org.apache.paimon.spark.leafnode.PaimonLeafRunnableCommand
24
24
import org .apache .paimon .spark .schema .{PaimonMetadataColumn , SparkSystemColumns }
25
25
import org .apache .paimon .spark .schema .PaimonMetadataColumn .{FILE_PATH , FILE_PATH_COLUMN , ROW_INDEX , ROW_INDEX_COLUMN }
26
26
import org .apache .paimon .spark .util .{EncoderUtils , SparkRowUtils }
27
- import org .apache .paimon .table .FileStoreTable
27
+ import org .apache .paimon .table .{ FileStoreTable , SpecialFields }
28
28
import org .apache .paimon .table .sink .CommitMessage
29
29
import org .apache .paimon .types .RowKind
30
30
@@ -114,14 +114,10 @@ case class MergeIntoPaimonTable(
114
114
sparkSession,
115
115
filteredRelation,
116
116
remainDeletedRow = true ,
117
- metadataCols = metadataCols)
117
+ extraMetadataCols = metadataCols)
118
118
119
119
ds.cache()
120
120
try {
121
- val rowKindAttribute = ds.queryExecution.analyzed.output
122
- .find(attr => sparkSession.sessionState.conf.resolver(attr.name, ROW_KIND_COL ))
123
- .getOrElse(throw new RuntimeException (" Can not find _row_kind_ column." ))
124
-
125
121
// Step3: filter rows that should be marked as DELETED in Deletion Vector mode.
126
122
val dvDS = ds.where(
127
123
s " $ROW_KIND_COL = ${RowKind .DELETE .toByteValue} or $ROW_KIND_COL = ${RowKind .UPDATE_AFTER .toByteValue}" )
@@ -141,8 +137,10 @@ case class MergeIntoPaimonTable(
141
137
ds.unpersist()
142
138
}
143
139
} else {
144
- val touchedFilePathsSet = mutable.Set .empty[String ]
145
- val intersectionFilePaths = mutable.Set .empty[String ]
140
+ // Files that need to be rewritten
141
+ val filePathsToRewritten = mutable.Set .empty[String ]
142
+ // Files that need to be read but not rewritten
143
+ val filePathsToRead = mutable.Set .empty[String ]
146
144
147
145
def hasUpdate (actions : Seq [MergeAction ]): Boolean = {
148
146
actions.exists {
@@ -159,39 +157,44 @@ case class MergeIntoPaimonTable(
159
157
}
160
158
161
159
if (hasUpdate(matchedActions)) {
162
- touchedFilePathsSet ++= findTouchedFiles0(" inner" )
160
+ filePathsToRewritten ++= findTouchedFiles0(" inner" )
163
161
} else if (notMatchedActions.nonEmpty) {
164
- intersectionFilePaths ++= findTouchedFiles0(" inner" )
162
+ filePathsToRead ++= findTouchedFiles0(" inner" )
165
163
}
166
164
167
165
if (hasUpdate(notMatchedBySourceActions)) {
168
- touchedFilePathsSet ++= findTouchedFiles0(" left_anti" )
169
- }
170
-
171
- val touchedFilePaths : Array [String ] = touchedFilePathsSet.toArray
172
- val unTouchedFilePaths = if (notMatchedActions.nonEmpty) {
173
- intersectionFilePaths.diff(touchedFilePathsSet).toArray
174
- } else {
175
- Array [String ]()
166
+ val noMatchedBySourceFilePaths = findTouchedFiles0(" left_anti" )
167
+ filePathsToRewritten ++= noMatchedBySourceFilePaths
168
+ filePathsToRead --= noMatchedBySourceFilePaths
176
169
}
177
170
178
- val (touchedFiles , touchedFileRelation) =
179
- createNewRelation(touchedFilePaths , dataFilePathToMeta, relation)
171
+ val (filesToRewritten , touchedFileRelation) =
172
+ createNewRelation(filePathsToRewritten.toArray , dataFilePathToMeta, relation)
180
173
val (_, unTouchedFileRelation) =
181
- createNewRelation(unTouchedFilePaths , dataFilePathToMeta, relation)
174
+ createNewRelation(filePathsToRead.toArray , dataFilePathToMeta, relation)
182
175
183
176
// Add FILE_TOUCHED_COL to mark the row as coming from the touched file, if the row has not been
184
177
// modified and was from touched file, it should be kept too.
185
- val touchedDsWithFileTouchedCol = createDataset(sparkSession, touchedFileRelation)
178
+ val targetDSWithFileTouchedCol = createDataset(sparkSession, touchedFileRelation)
186
179
.withColumn(FILE_TOUCHED_COL , lit(true ))
187
- val targetDSWithFileTouchedCol = touchedDsWithFileTouchedCol.union(
188
- createDataset(sparkSession, unTouchedFileRelation)
180
+ .union(createDataset(sparkSession, unTouchedFileRelation)
189
181
.withColumn(FILE_TOUCHED_COL , lit(false )))
190
182
191
- val toWriteDS =
192
- constructChangedRows(sparkSession, targetDSWithFileTouchedCol).drop(ROW_KIND_COL )
193
- val addCommitMessage = dvSafeWriter.write(toWriteDS)
194
- val deletedCommitMessage = buildDeletedCommitMessage(touchedFiles)
183
+ // If no files need to be rewritten, there is no need to write row lineage
184
+ val writeRowLineage = coreOptions.rowTrackingEnabled() && filesToRewritten.nonEmpty
185
+
186
+ val toWriteDS = constructChangedRows(
187
+ sparkSession,
188
+ targetDSWithFileTouchedCol,
189
+ writeRowLineage = writeRowLineage).drop(ROW_KIND_COL )
190
+
191
+ val writer = if (writeRowLineage) {
192
+ dvSafeWriter.withRowLineage()
193
+ } else {
194
+ dvSafeWriter
195
+ }
196
+ val addCommitMessage = writer.write(toWriteDS)
197
+ val deletedCommitMessage = buildDeletedCommitMessage(filesToRewritten)
195
198
196
199
addCommitMessage ++ deletedCommitMessage
197
200
}
@@ -203,7 +206,8 @@ case class MergeIntoPaimonTable(
203
206
targetDataset : Dataset [Row ],
204
207
remainDeletedRow : Boolean = false ,
205
208
deletionVectorEnabled : Boolean = false ,
206
- metadataCols : Seq [PaimonMetadataColumn ] = Seq .empty): Dataset [Row ] = {
209
+ extraMetadataCols : Seq [PaimonMetadataColumn ] = Seq .empty,
210
+ writeRowLineage : Boolean = false ): Dataset [Row ] = {
207
211
val targetDS = targetDataset
208
212
.withColumn(TARGET_ROW_COL , lit(true ))
209
213
@@ -217,25 +221,42 @@ case class MergeIntoPaimonTable(
217
221
resolveExpressions(sparkSession)(exprs, joinedPlan)
218
222
}
219
223
220
- val targetOutput = filteredTargetPlan.output
221
224
val targetRowNotMatched = resolveOnJoinedPlan(
222
225
Seq (toExpression(sparkSession, col(SOURCE_ROW_COL ).isNull))).head
223
226
val sourceRowNotMatched = resolveOnJoinedPlan(
224
227
Seq (toExpression(sparkSession, col(TARGET_ROW_COL ).isNull))).head
225
228
val matchedExprs = matchedActions.map(_.condition.getOrElse(TrueLiteral ))
226
229
val notMatchedExprs = notMatchedActions.map(_.condition.getOrElse(TrueLiteral ))
227
230
val notMatchedBySourceExprs = notMatchedBySourceActions.map(_.condition.getOrElse(TrueLiteral ))
228
- val noopOutput = targetOutput :+ Alias (Literal (NOOP_ROW_KIND_VALUE ), ROW_KIND_COL )()
229
- val keepOutput = targetOutput :+ Alias (Literal (RowKind .INSERT .toByteValue), ROW_KIND_COL )()
230
231
231
232
val resolver = sparkSession.sessionState.conf.resolver
232
- val metadataAttributes = metadataCols.flatMap {
233
- metadataCol => joinedPlan.output.find(attr => resolver(metadataCol.name, attr.name))
233
+ def attribute (name : String ) = joinedPlan.output.find(attr => resolver(name, attr.name))
234
+ val extraMetadataAttributes =
235
+ extraMetadataCols.flatMap(metadataCol => attribute(metadataCol.name))
236
+ val (rowIdAttr, sequenceNumberAttr) = if (writeRowLineage) {
237
+ (
238
+ attribute(SpecialFields .ROW_ID .name()).get,
239
+ attribute(SpecialFields .SEQUENCE_NUMBER .name()).get)
240
+ } else {
241
+ (null , null )
242
+ }
243
+
244
+ val targetOutput = if (writeRowLineage) {
245
+ filteredTargetPlan.output ++ Seq (rowIdAttr, sequenceNumberAttr)
246
+ } else {
247
+ filteredTargetPlan.output
234
248
}
249
+ val noopOutput = targetOutput :+ Alias (Literal (NOOP_ROW_KIND_VALUE ), ROW_KIND_COL )()
250
+ val keepOutput = targetOutput :+ Alias (Literal (RowKind .INSERT .toByteValue), ROW_KIND_COL )()
251
+
235
252
def processMergeActions (actions : Seq [MergeAction ]): Seq [Seq [Expression ]] = {
236
253
val columnExprs = actions.map {
237
254
case UpdateAction (_, assignments) =>
238
- assignments.map(_.value) :+ Literal (RowKind .UPDATE_AFTER .toByteValue)
255
+ var exprs = assignments.map(_.value)
256
+ if (writeRowLineage) {
257
+ exprs ++= Seq (rowIdAttr, Literal (null ))
258
+ }
259
+ exprs :+ Literal (RowKind .UPDATE_AFTER .toByteValue)
239
260
case DeleteAction (_) =>
240
261
if (remainDeletedRow || deletionVectorEnabled) {
241
262
targetOutput :+ Literal (RowKind .DELETE .toByteValue)
@@ -245,17 +266,26 @@ case class MergeIntoPaimonTable(
245
266
noopOutput
246
267
}
247
268
case InsertAction (_, assignments) =>
248
- assignments.map(_.value) :+ Literal (RowKind .INSERT .toByteValue)
269
+ var exprs = assignments.map(_.value)
270
+ if (writeRowLineage) {
271
+ exprs ++= Seq (rowIdAttr, sequenceNumberAttr)
272
+ }
273
+ exprs :+ Literal (RowKind .INSERT .toByteValue)
249
274
}
250
- columnExprs.map(exprs => exprs ++ metadataAttributes)
275
+
276
+ columnExprs.map(exprs => exprs ++ extraMetadataAttributes)
251
277
}
252
278
253
279
val matchedOutputs = processMergeActions(matchedActions)
254
280
val notMatchedBySourceOutputs = processMergeActions(notMatchedBySourceActions)
255
281
val notMatchedOutputs = processMergeActions(notMatchedActions)
256
282
val outputFields = mutable.ArrayBuffer (tableSchema.fields: _* )
283
+ if (writeRowLineage) {
284
+ outputFields += PaimonMetadataColumn .ROW_ID .toStructField
285
+ outputFields += PaimonMetadataColumn .SEQUENCE_NUMBER .toStructField
286
+ }
257
287
outputFields += StructField (ROW_KIND_COL , ByteType )
258
- outputFields ++= metadataCols .map(_.toStructField)
288
+ outputFields ++= extraMetadataCols .map(_.toStructField)
259
289
val outputSchema = StructType (outputFields.toSeq)
260
290
261
291
val joinedRowEncoder = EncoderUtils .encode(joinedPlan.schema)
0 commit comments