From 95802482f2816e36cdfe03d4c30cf92391dfaf80 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Fri, 7 Nov 2025 16:52:02 +0100 Subject: [PATCH 01/20] length of edit script is correct, working on path --- .../jetbrains/kotlinx/dataframe/api/add.kt | 1 - .../dataframe/impl/api/compareDataFrames.kt | 134 ++++++++++++++++++ .../jetbrains/kotlinx/dataframe/api/add.kt | 25 ++++ 3 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index 4aed230f44..a16e153df8 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -15,7 +15,6 @@ import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload import org.jetbrains.kotlinx.dataframe.annotations.HasSchema import org.jetbrains.kotlinx.dataframe.annotations.Interpretable import org.jetbrains.kotlinx.dataframe.annotations.Refine -import org.jetbrains.kotlinx.dataframe.api.add import org.jetbrains.kotlinx.dataframe.columns.BaseColumn import org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor import org.jetbrains.kotlinx.dataframe.columns.ColumnPath diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt new file mode 100644 index 0000000000..766e58da58 --- /dev/null +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -0,0 +1,134 @@ +package org.jetbrains.kotlinx.dataframe.impl.api + +// same schema, returns modified, added and removed rows, first try with string + +internal fun compareImpl(dfA: String, dfB: String): MutableList> { + // what i want Myers alg + val path = mutableListOf>() + var sesLength: Int? + // Myers algorithm, preparing + val sum_of_length = dfA.length + dfB.length + val v = arrayListOf() + for (d in 0..sum_of_length) { + v.add(IntArray(sum_of_length * 2 + 1)) + } + var isOver = false + // starting the algorithm + // 0 position is -(M+N) position in the alg's paper -> need to normalize each access to v + v[0][1 + sum_of_length] = 0 // fitticious + var d = 0 + while (d <= sum_of_length && !isOver) { + for (k in -d..d step 2) { + var x: Int? + if (k == -d || k != d && v[d][k - 1 + sum_of_length] < v[d][k + 1 + sum_of_length]) { + x = v[d][k + 1 + sum_of_length] + } else { + x = v[d][k - 1 + sum_of_length] + 1 + } + var y = x - k + while (x < dfA.length && y < dfB.length && dfA[x] == dfB[y]) { + x += 1 + y += 1 + } + v[d][k + sum_of_length] = x + // need these datas in next iteration + if (d < sum_of_length) { + v[d + 1][k + sum_of_length] = x + } + // + if (x >= dfA.length && y >= dfB.length) { + isOver = true + val d1 = d + recoursive_path_fill(path, v, d1, k, sum_of_length, dfA, dfB) + // if i am the last (not only) i am a furthest reachin endpoint + path.add(Pair(0, 0)) + break + } + } + d++ // try with a longer edit script + } + return path +} + +internal fun recoursive_path_fill( + path: MutableList>, + v: ArrayList, + d: Int, + k: Int, + sum_of_length: Int, + dfA: String, + dfB: String, +) { + if (d < 0) { + return + } + // enlist my self + val xCurrent = v[d][k + sum_of_length] + val yCurrent = xCurrent - k + path.add(Pair(xCurrent, yCurrent)) + // choose the next one to enlist (my previous' furthest reaching endpoint) + val dNext = d - 1 + val kNext1 = k + 1 + val kNext2 = k - 1 + val listOfPossibleNext = listOf(kNext1, kNext2) + var x: Int? + var y: Int? + // I try my possible next to reach my self + for (kCurrent in listOfPossibleNext) { + if (kCurrent == -dNext || + kCurrent != dNext && + v[dNext][kCurrent - 1 + sum_of_length] < v[dNext][k + 1 + sum_of_length] + ) { + x = v[dNext][k + 1 + sum_of_length] + } else { + x = v[dNext][k - 1 + sum_of_length] + 1 + } + y = x - k + // eventual snake before me + val snake = mutableListOf>() + while (x < dfA.length && y < dfB.length && x >= 0 && y>=0 && dfA[x] == dfB[y]) { + x += 1 + y += 1 + snake.add(Pair(x, y)) + } + if(snake.isNotEmpty()) + snake.removeLast() // will be eventually added to path in the next recoursive step + if (x == xCurrent && y == yCurrent) { + for (e in snake) { + path.add(e) + } + recoursive_path_fill(path, v, dNext, kCurrent, sum_of_length, dfA, dfB) + return + } + } +} + +// internal fun compareImpl(dfA: DataFrame, dfB: DataFrame): Iterable> { +// +// //Myers algorithm +// val sum_of_length = dfA.nrow+dfB.nrow +// val v = IntArray(sum_of_length*2+1) //0 position is -(M+N) position in the alg's paper -> need to normalize each access to v +// var isOver=false +// +// v[1+sum_of_length]=0 +// var d=0 +// while(d<=sum_of_length && !isOver){ +// for(k in -d .. d step 2){ +// var x: Int? +// var y: Int? +// if (k==-d || k!=d && v[k-1+sum_of_length] < v[k+1+sum_of_length]) +// x = v[k+1+sum_of_length] +// else +// x = v[k-1+sum_of_length] +// y = x-k +// while (x < dfA.nrow && y= dfA.nrow && y>=dfB.nrow) +// isOver=true +// } +// d++ +// } +// } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index 8b0b8143ac..a01ed59f07 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -4,6 +4,7 @@ import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.shouldBe import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.impl.api.compareImpl import org.junit.Test import kotlin.reflect.typeOf @@ -58,4 +59,28 @@ class AddTests { df["fibonacci1"].toList() shouldBe listOf(1, 1, 2, 3, 5, 8, 13, 21, 34, 55) df["fibonacci2"].toList() shouldBe listOf(1, 1, 2, 3, 5, 8, 13, 21, 34, 55) } + + @Test + fun `compare`() { + val path = compareImpl("abcabba", "cbabac") + path shouldBe listOf>(Pair(0,0), Pair(3, 1)) + } + + @Test + fun `compare2`() { + val path = compareImpl("a", "a") + path shouldBe 0 + } + + @Test + fun `compare3`() { + val path = compareImpl("a", "ab") + path shouldBe 1 + } + + @Test + fun `compare4`() { + val path = compareImpl("ab", "a") + path shouldBe 1 + } } From eeee3f32ffa2c4e425c850e0c914d153d155a5f1 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sat, 8 Nov 2025 09:24:30 +0100 Subject: [PATCH 02/20] trying --- .../dataframe/impl/api/compareDataFrames.kt | 90 +++++++------------ .../jetbrains/kotlinx/dataframe/api/add.kt | 2 +- 2 files changed, 33 insertions(+), 59 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 766e58da58..f135e96c99 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -5,6 +5,7 @@ package org.jetbrains.kotlinx.dataframe.impl.api internal fun compareImpl(dfA: String, dfB: String): MutableList> { // what i want Myers alg val path = mutableListOf>() + //val path = linkedSetOf>() var sesLength: Int? // Myers algorithm, preparing val sum_of_length = dfA.length + dfB.length @@ -40,7 +41,7 @@ internal fun compareImpl(dfA: String, dfB: String): MutableList> isOver = true val d1 = d recoursive_path_fill(path, v, d1, k, sum_of_length, dfA, dfB) - // if i am the last (not only) i am a furthest reachin endpoint + // if i am the last (not only) i am a furthest reaching endpoint path.add(Pair(0, 0)) break } @@ -67,68 +68,41 @@ internal fun recoursive_path_fill( val yCurrent = xCurrent - k path.add(Pair(xCurrent, yCurrent)) // choose the next one to enlist (my previous' furthest reaching endpoint) - val dNext = d - 1 - val kNext1 = k + 1 - val kNext2 = k - 1 - val listOfPossibleNext = listOf(kNext1, kNext2) - var x: Int? - var y: Int? - // I try my possible next to reach my self - for (kCurrent in listOfPossibleNext) { - if (kCurrent == -dNext || - kCurrent != dNext && - v[dNext][kCurrent - 1 + sum_of_length] < v[dNext][k + 1 + sum_of_length] - ) { - x = v[dNext][k + 1 + sum_of_length] +// To list an optimal path from (0,0) to the point Vd[k] first deter +// mine whether it is at the end of a maximal snake following a vertical edge from Vd−1[k+1] or a horizontal edge +// from Vd−1[k−1] + + + var kTry1 = k + 1 + var kTry2 = k - 1 + val tries = listOf(kTry1, kTry2) + for (kT in tries) { + var xPrev = v[d][kT+sum_of_length] + var yPrev = xPrev - kT + if (kT == kTry1) { + yPrev++ } else { - x = v[dNext][k - 1 + sum_of_length] + 1 + xPrev++ } - y = x - k - // eventual snake before me val snake = mutableListOf>() - while (x < dfA.length && y < dfB.length && x >= 0 && y>=0 && dfA[x] == dfB[y]) { - x += 1 - y += 1 - snake.add(Pair(x, y)) - } - if(snake.isNotEmpty()) - snake.removeLast() // will be eventually added to path in the next recoursive step - if (x == xCurrent && y == yCurrent) { - for (e in snake) { - path.add(e) + while (xPrev <= xCurrent && yPrev <= yCurrent) { //loop is done at least once + snake.add(Pair(xPrev, yPrev)) + if (xPrev == xCurrent && yPrev == yCurrent) { + if (snake.isNotEmpty()) { + snake.removeLast() + } + for (e in snake) { + path.add(e) //da cambiare + } + recoursive_path_fill(path, v, d - 1, kT, sum_of_length, dfA, dfB) + return } - recoursive_path_fill(path, v, dNext, kCurrent, sum_of_length, dfA, dfB) - return + xPrev += 1 + yPrev += 1 } } + } -// internal fun compareImpl(dfA: DataFrame, dfB: DataFrame): Iterable> { -// -// //Myers algorithm -// val sum_of_length = dfA.nrow+dfB.nrow -// val v = IntArray(sum_of_length*2+1) //0 position is -(M+N) position in the alg's paper -> need to normalize each access to v -// var isOver=false -// -// v[1+sum_of_length]=0 -// var d=0 -// while(d<=sum_of_length && !isOver){ -// for(k in -d .. d step 2){ -// var x: Int? -// var y: Int? -// if (k==-d || k!=d && v[k-1+sum_of_length] < v[k+1+sum_of_length]) -// x = v[k+1+sum_of_length] -// else -// x = v[k-1+sum_of_length] -// y = x-k -// while (x < dfA.nrow && y= dfA.nrow && y>=dfB.nrow) -// isOver=true -// } -// d++ -// } -// } + + diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index a01ed59f07..04515749fe 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -63,7 +63,7 @@ class AddTests { @Test fun `compare`() { val path = compareImpl("abcabba", "cbabac") - path shouldBe listOf>(Pair(0,0), Pair(3, 1)) + path shouldBe listOf>(Pair(0, 0), Pair(3, 1)) } @Test From 5fd2bfaa8fc77d7f5d92f79c34ff907654e0493a Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sat, 8 Nov 2025 15:26:15 +0100 Subject: [PATCH 03/20] this is working but snake before f.r.e. --- .../dataframe/impl/api/compareDataFrames.kt | 58 ++++++++++--------- .../jetbrains/kotlinx/dataframe/api/add.kt | 10 +++- 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index f135e96c99..7ef0017812 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -3,9 +3,8 @@ package org.jetbrains.kotlinx.dataframe.impl.api // same schema, returns modified, added and removed rows, first try with string internal fun compareImpl(dfA: String, dfB: String): MutableList> { - // what i want Myers alg + // what i want from Myers alg val path = mutableListOf>() - //val path = linkedSetOf>() var sesLength: Int? // Myers algorithm, preparing val sum_of_length = dfA.length + dfB.length @@ -42,7 +41,6 @@ internal fun compareImpl(dfA: String, dfB: String): MutableList> val d1 = d recoursive_path_fill(path, v, d1, k, sum_of_length, dfA, dfB) // if i am the last (not only) i am a furthest reaching endpoint - path.add(Pair(0, 0)) break } } @@ -72,36 +70,40 @@ internal fun recoursive_path_fill( // mine whether it is at the end of a maximal snake following a vertical edge from Vd−1[k+1] or a horizontal edge // from Vd−1[k−1] - - var kTry1 = k + 1 - var kTry2 = k - 1 - val tries = listOf(kTry1, kTry2) - for (kT in tries) { - var xPrev = v[d][kT+sum_of_length] - var yPrev = xPrev - kT - if (kT == kTry1) { - yPrev++ - } else { - xPrev++ - } - val snake = mutableListOf>() - while (xPrev <= xCurrent && yPrev <= yCurrent) { //loop is done at least once - snake.add(Pair(xPrev, yPrev)) - if (xPrev == xCurrent && yPrev == yCurrent) { - if (snake.isNotEmpty()) { - snake.removeLast() + if(d>0) { + var kTry1 = k + 1 + var kTry2 = k - 1 + val tries = listOf(kTry1, kTry2) + for (kT in tries) { + var xPrev = v[d - 1][kT + sum_of_length] + var yPrev = xPrev - kT + if (kT == kTry1) { + yPrev++ + } else { + xPrev++ + } + val snake = mutableListOf>() + while (xPrev <= xCurrent && yPrev <= yCurrent) { //loop is done at least once + snake.add(Pair(xPrev, yPrev)) + if (xPrev == xCurrent && yPrev == yCurrent) { + if (snake.isNotEmpty()) { + snake.removeLast() + for (e in snake) { + path.add(e) //da cambiare + } + } + recoursive_path_fill(path, v, d - 1, kT, sum_of_length, dfA, dfB) + return } - for (e in snake) { - path.add(e) //da cambiare + if(xPrev < dfA.length && yPrev < dfB.length && xPrev >= 0 && yPrev >= 0 && dfA[xPrev] == dfB[yPrev]) { + xPrev += 1 + yPrev += 1 } - recoursive_path_fill(path, v, d - 1, kT, sum_of_length, dfA, dfB) - return + else + break } - xPrev += 1 - yPrev += 1 } } - } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index 04515749fe..bf5873b25a 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -63,7 +63,15 @@ class AddTests { @Test fun `compare`() { val path = compareImpl("abcabba", "cbabac") - path shouldBe listOf>(Pair(0, 0), Pair(3, 1)) + path shouldBe listOf() +// path[5][1+13] shouldBe 7 +// //path[4][1+13] shouldBe 7 +// +// path[4][2+13] shouldBe 7 +// //path[3][2+13] shouldBe 7 +// +// path[3][1+13] shouldBe 5 +// //path[2][1+13] shouldBe 5 } @Test From fccaaf6c67f5ca73375d0c64416cbec2b168c5bb Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sat, 8 Nov 2025 16:17:17 +0100 Subject: [PATCH 04/20] cleaning --- .../dataframe/impl/api/compareDataFrames.kt | 20 +++++++++---------- .../jetbrains/kotlinx/dataframe/api/add.kt | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 7ef0017812..359cd7125d 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -46,6 +46,7 @@ internal fun compareImpl(dfA: String, dfB: String): MutableList> } d++ // try with a longer edit script } + path.reverse() return path } @@ -70,9 +71,9 @@ internal fun recoursive_path_fill( // mine whether it is at the end of a maximal snake following a vertical edge from Vd−1[k+1] or a horizontal edge // from Vd−1[k−1] - if(d>0) { - var kTry1 = k + 1 - var kTry2 = k - 1 + if (d > 0) { + val kTry1 = k + 1 + val kTry2 = k - 1 val tries = listOf(kTry1, kTry2) for (kT in tries) { var xPrev = v[d - 1][kT + sum_of_length] @@ -83,7 +84,8 @@ internal fun recoursive_path_fill( xPrev++ } val snake = mutableListOf>() - while (xPrev <= xCurrent && yPrev <= yCurrent) { //loop is done at least once + var skipThisRoundOfOuterLoop = false + do { snake.add(Pair(xPrev, yPrev)) if (xPrev == xCurrent && yPrev == yCurrent) { if (snake.isNotEmpty()) { @@ -95,16 +97,14 @@ internal fun recoursive_path_fill( recoursive_path_fill(path, v, d - 1, kT, sum_of_length, dfA, dfB) return } - if(xPrev < dfA.length && yPrev < dfB.length && xPrev >= 0 && yPrev >= 0 && dfA[xPrev] == dfB[yPrev]) { + if (xPrev < dfA.length && yPrev < dfB.length && xPrev >= 0 && yPrev >= 0 && dfA[xPrev] == dfB[yPrev]) { xPrev += 1 yPrev += 1 + } else { + skipThisRoundOfOuterLoop = true } - else - break } + while (xPrev <= xCurrent && yPrev <= yCurrent && !skipThisRoundOfOuterLoop) } } } - - - diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index bf5873b25a..a1961ae40e 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -76,8 +76,8 @@ class AddTests { @Test fun `compare2`() { - val path = compareImpl("a", "a") - path shouldBe 0 + val path = compareImpl("aaaab", "aaaac") + path shouldBe listOf() } @Test From bcc41e09e05de8108bd50695de023f26e148842f Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sun, 9 Nov 2025 09:06:11 +0100 Subject: [PATCH 05/20] cleaning --- .../dataframe/impl/api/compareDataFrames.kt | 62 ++++++++++--------- .../jetbrains/kotlinx/dataframe/api/add.kt | 10 +-- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 359cd7125d..3db43b1312 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -1,45 +1,51 @@ package org.jetbrains.kotlinx.dataframe.impl.api -// same schema, returns modified, added and removed rows, first try with string -internal fun compareImpl(dfA: String, dfB: String): MutableList> { +/** + * dfs (will be) with same schema, returns the path from origin to (N,M) in the edit path. + * N is dfA.nrow, M is dfB.nrow + * knowing this path is knowing the differences between dfA and dfB. + * cost of this alg's worst case in O( (N+M)D ), D is the length of shortest edit script + */ +internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList> { // what i want from Myers alg val path = mutableListOf>() - var sesLength: Int? + var sesLength: Int? // ses stands for shortest edit script // Myers algorithm, preparing - val sum_of_length = dfA.length + dfB.length + val sumOfLength = dfA.length + dfB.length val v = arrayListOf() - for (d in 0..sum_of_length) { - v.add(IntArray(sum_of_length * 2 + 1)) + for (d in 0..sumOfLength) { + v.add(IntArray(sumOfLength * 2 + 1)) } var isOver = false // starting the algorithm // 0 position is -(M+N) position in the alg's paper -> need to normalize each access to v - v[0][1 + sum_of_length] = 0 // fitticious + val normalizer = sumOfLength + v[0][1 + normalizer] = 0 // fitticious var d = 0 - while (d <= sum_of_length && !isOver) { + while (d <= sumOfLength && !isOver) { for (k in -d..d step 2) { var x: Int? - if (k == -d || k != d && v[d][k - 1 + sum_of_length] < v[d][k + 1 + sum_of_length]) { - x = v[d][k + 1 + sum_of_length] + if (k == -d || k != d && v[d][k - 1 + normalizer] < v[d][k + 1 + normalizer]) { + x = v[d][k + 1 + normalizer] } else { - x = v[d][k - 1 + sum_of_length] + 1 + x = v[d][k - 1 + normalizer] + 1 } var y = x - k while (x < dfA.length && y < dfB.length && dfA[x] == dfB[y]) { x += 1 y += 1 } - v[d][k + sum_of_length] = x + v[d][k + normalizer] = x // need these datas in next iteration - if (d < sum_of_length) { - v[d + 1][k + sum_of_length] = x + if (d < sumOfLength) { + v[d + 1][k + normalizer] = x } // if (x >= dfA.length && y >= dfB.length) { isOver = true val d1 = d - recoursive_path_fill(path, v, d1, k, sum_of_length, dfA, dfB) + recoursivePathFill(path, v, d1, k, normalizer, dfA, dfB) // if i am the last (not only) i am a furthest reaching endpoint break } @@ -50,33 +56,33 @@ internal fun compareImpl(dfA: String, dfB: String): MutableList> return path } -internal fun recoursive_path_fill( +internal fun recoursivePathFill( path: MutableList>, v: ArrayList, d: Int, k: Int, - sum_of_length: Int, + normalizer: Int, dfA: String, dfB: String, ) { + //basic step if (d < 0) { return } // enlist my self - val xCurrent = v[d][k + sum_of_length] + val xCurrent = v[d][k + normalizer] val yCurrent = xCurrent - k path.add(Pair(xCurrent, yCurrent)) - // choose the next one to enlist (my previous' furthest reaching endpoint) -// To list an optimal path from (0,0) to the point Vd[k] first deter -// mine whether it is at the end of a maximal snake following a vertical edge from Vd−1[k+1] or a horizontal edge -// from Vd−1[k−1] - + // choose the furthest reaching endpoint that precedes me + // To list an optimal path from (0,0) to the point Vd[k] first determine + // whether it is at the end of a maximal snake following a vertical edge from Vd−1[k+1] or a horizontal edge + // from Vd−1[k−1] if (d > 0) { val kTry1 = k + 1 val kTry2 = k - 1 val tries = listOf(kTry1, kTry2) for (kT in tries) { - var xPrev = v[d - 1][kT + sum_of_length] + var xPrev = v[d - 1][kT + normalizer] var yPrev = xPrev - kT if (kT == kTry1) { yPrev++ @@ -86,15 +92,15 @@ internal fun recoursive_path_fill( val snake = mutableListOf>() var skipThisRoundOfOuterLoop = false do { - snake.add(Pair(xPrev, yPrev)) + snake.add(0, Pair(xPrev, yPrev)) if (xPrev == xCurrent && yPrev == yCurrent) { if (snake.isNotEmpty()) { - snake.removeLast() + snake.removeFirst() for (e in snake) { - path.add(e) //da cambiare + path.add(e) // da cambiare } } - recoursive_path_fill(path, v, d - 1, kT, sum_of_length, dfA, dfB) + recoursivePathFill(path, v, d - 1, kT, normalizer, dfA, dfB) return } if (xPrev < dfA.length && yPrev < dfB.length && xPrev >= 0 && yPrev >= 0 && dfA[xPrev] == dfB[yPrev]) { diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index a1961ae40e..3ba8a43a15 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -4,7 +4,7 @@ import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.shouldBe import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame -import org.jetbrains.kotlinx.dataframe.impl.api.compareImpl +import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl import org.junit.Test import kotlin.reflect.typeOf @@ -62,7 +62,7 @@ class AddTests { @Test fun `compare`() { - val path = compareImpl("abcabba", "cbabac") + val path = myersDifferenceAlgorithmImpl("abcabba", "cbabac") path shouldBe listOf() // path[5][1+13] shouldBe 7 // //path[4][1+13] shouldBe 7 @@ -76,19 +76,19 @@ class AddTests { @Test fun `compare2`() { - val path = compareImpl("aaaab", "aaaac") + val path = myersDifferenceAlgorithmImpl("aaaab", "aaaac") path shouldBe listOf() } @Test fun `compare3`() { - val path = compareImpl("a", "ab") + val path = myersDifferenceAlgorithmImpl("a", "ab") path shouldBe 1 } @Test fun `compare4`() { - val path = compareImpl("ab", "a") + val path = myersDifferenceAlgorithmImpl("ab", "a") path shouldBe 1 } } From 51e2b239e546743be1b3ba81b335203ee92d9b10 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sun, 9 Nov 2025 09:47:01 +0100 Subject: [PATCH 06/20] refining logic --- .../dataframe/impl/api/compareDataFrames.kt | 35 +++++++++++++------ .../jetbrains/kotlinx/dataframe/api/add.kt | 2 +- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 3db43b1312..3fe03b0bb1 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -1,11 +1,12 @@ package org.jetbrains.kotlinx.dataframe.impl.api - /** - * dfs (will be) with same schema, returns the path from origin to (N,M) in the edit path. - * N is dfA.nrow, M is dfB.nrow - * knowing this path is knowing the differences between dfA and dfB. - * cost of this alg's worst case in O( (N+M)D ), D is the length of shortest edit script + * dfs (will be) with same schema. Returns the path from origin to (N,M) in the edit path. + * N is dfA.nrow, M is dfB.nrow. + * Knowing this path is knowing the differences between dfA and dfB + * and the shortest edit script to get B from A. + * cost of this alg's worst case in O( (N+M)D ), D is the length of shortest edit script. + * snake: a set of diagonal edges, possibly empty */ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList> { // what i want from Myers alg @@ -52,7 +53,6 @@ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList } d++ // try with a longer edit script } - path.reverse() return path } @@ -65,10 +65,6 @@ internal fun recoursivePathFill( dfA: String, dfB: String, ) { - //basic step - if (d < 0) { - return - } // enlist my self val xCurrent = v[d][k + normalizer] val yCurrent = xCurrent - k @@ -97,7 +93,7 @@ internal fun recoursivePathFill( if (snake.isNotEmpty()) { snake.removeFirst() for (e in snake) { - path.add(e) // da cambiare + path.add(e) } } recoursivePathFill(path, v, d - 1, kT, normalizer, dfA, dfB) @@ -113,4 +109,21 @@ internal fun recoursivePathFill( while (xPrev <= xCurrent && yPrev <= yCurrent && !skipThisRoundOfOuterLoop) } } + // step base, + // eventually need to build the snake from origin to the furthest reaching point with d=0 + // moreover the path is reversed so that it can be read from left to right correctly + if (d == 0) { + if (path.last().first != 0 && path.last().second != 0) { + val last = path.last() + var x = last.first - 1 + var y = last.second - 1 + while (x >= 0 && y >= 0) { + path.add(Pair(x, y)) + x -= 1 + y -= 1 + } + } + path.reverse() + return + } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index 3ba8a43a15..1744c5efcd 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -76,7 +76,7 @@ class AddTests { @Test fun `compare2`() { - val path = myersDifferenceAlgorithmImpl("aaaab", "aaaac") + val path = myersDifferenceAlgorithmImpl("aaaa", "aaaa") path shouldBe listOf() } From f4b21e314da5950eb02b6bd82ad5ce20200b3fdd Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Mon, 10 Nov 2025 17:34:09 +0100 Subject: [PATCH 07/20] algorythm works good with strings, next step is swithcing to dataFrames, there is no difference in the logic --- .../dataframe/impl/api/compareDataFrames.kt | 11 +++--- .../jetbrains/kotlinx/dataframe/api/add.kt | 32 --------------- .../dataframe/api/compareDataFrames.kt | 39 +++++++++++++++++++ 3 files changed, 45 insertions(+), 37 deletions(-) create mode 100644 core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 3fe03b0bb1..8e7a40b5ef 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -9,9 +9,11 @@ package org.jetbrains.kotlinx.dataframe.impl.api * snake: a set of diagonal edges, possibly empty */ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList> { - // what i want from Myers alg + // what I want from Myers alg val path = mutableListOf>() - var sesLength: Int? // ses stands for shortest edit script + // 'ses' stands for shortest edit script, next var is never returned, it is in the code + // to show the capabilities of the algorithm + var sesLength: Int? // Myers algorithm, preparing val sumOfLength = dfA.length + dfB.length val v = arrayListOf() @@ -45,9 +47,8 @@ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList // if (x >= dfA.length && y >= dfB.length) { isOver = true - val d1 = d - recoursivePathFill(path, v, d1, k, normalizer, dfA, dfB) - // if i am the last (not only) i am a furthest reaching endpoint + sesLength = d + recoursivePathFill(path, v, d, k, normalizer, dfA, dfB) break } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index 1744c5efcd..a2beae527a 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -59,36 +59,4 @@ class AddTests { df["fibonacci1"].toList() shouldBe listOf(1, 1, 2, 3, 5, 8, 13, 21, 34, 55) df["fibonacci2"].toList() shouldBe listOf(1, 1, 2, 3, 5, 8, 13, 21, 34, 55) } - - @Test - fun `compare`() { - val path = myersDifferenceAlgorithmImpl("abcabba", "cbabac") - path shouldBe listOf() -// path[5][1+13] shouldBe 7 -// //path[4][1+13] shouldBe 7 -// -// path[4][2+13] shouldBe 7 -// //path[3][2+13] shouldBe 7 -// -// path[3][1+13] shouldBe 5 -// //path[2][1+13] shouldBe 5 - } - - @Test - fun `compare2`() { - val path = myersDifferenceAlgorithmImpl("aaaa", "aaaa") - path shouldBe listOf() - } - - @Test - fun `compare3`() { - val path = myersDifferenceAlgorithmImpl("a", "ab") - path shouldBe 1 - } - - @Test - fun `compare4`() { - val path = myersDifferenceAlgorithmImpl("ab", "a") - path shouldBe 1 - } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt new file mode 100644 index 0000000000..487d165127 --- /dev/null +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -0,0 +1,39 @@ +package org.jetbrains.kotlinx.dataframe.api + +import io.kotest.matchers.shouldBe +import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl +import org.junit.Test + +class CompareDataFramesTest { + @Test + fun `Need both to delete and insert rows, preserving some rows`() { + val path = myersDifferenceAlgorithmImpl("abcabba", "cbabac") + path shouldBe listOf() +// path[5][1+13] shouldBe 7 +// //path[4][1+13] shouldBe 7 +// +// path[4][2+13] shouldBe 7 +// //path[3][2+13] shouldBe 7 +// +// path[3][1+13] shouldBe 5 +// //path[2][1+13] shouldBe 5 + } + + @Test + fun `need to do nothing`() { + val path = myersDifferenceAlgorithmImpl("aaaa", "aaaa") + path shouldBe listOf() + } + + @Test + fun `need to remove each row of dfA and insert each row of dfB`() { + val path = myersDifferenceAlgorithmImpl("abcd", "efgh") + path shouldBe listOf() + } + + @Test + fun `need to add each row`() { + val path = myersDifferenceAlgorithmImpl("", "abc") + path shouldBe listOf() + } +} From b25c712541671716a922713a63b905b6a35384ce Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Mon, 10 Nov 2025 17:50:57 +0100 Subject: [PATCH 08/20] cleaning --- core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt | 1 + core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index a16e153df8..4aed230f44 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -15,6 +15,7 @@ import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload import org.jetbrains.kotlinx.dataframe.annotations.HasSchema import org.jetbrains.kotlinx.dataframe.annotations.Interpretable import org.jetbrains.kotlinx.dataframe.annotations.Refine +import org.jetbrains.kotlinx.dataframe.api.add import org.jetbrains.kotlinx.dataframe.columns.BaseColumn import org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor import org.jetbrains.kotlinx.dataframe.columns.ColumnPath diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt index a2beae527a..8b0b8143ac 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/add.kt @@ -4,7 +4,6 @@ import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.shouldBe import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame -import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl import org.junit.Test import kotlin.reflect.typeOf From 026ffe6b455b83bd0e7d5ad82e743cb9a5ca9c1e Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Mon, 10 Nov 2025 18:10:37 +0100 Subject: [PATCH 09/20] tests --- .../dataframe/api/compareDataFrames.kt | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt index 487d165127..1d364a0676 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -3,37 +3,62 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.matchers.shouldBe import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl import org.junit.Test +import kotlin.Pair class CompareDataFramesTest { @Test fun `Need both to delete and insert rows, preserving some rows`() { val path = myersDifferenceAlgorithmImpl("abcabba", "cbabac") - path shouldBe listOf() -// path[5][1+13] shouldBe 7 -// //path[4][1+13] shouldBe 7 -// -// path[4][2+13] shouldBe 7 -// //path[3][2+13] shouldBe 7 -// -// path[3][1+13] shouldBe 5 -// //path[2][1+13] shouldBe 5 + path shouldBe listOf( + Pair(0, 0), + Pair(1, 0), + Pair(2, 0), + Pair(3, 1), + Pair(4, 1), + Pair(5, 2), + Pair(5, 3), + Pair(6, 4), + Pair(7, 5), + Pair(7, 6), + ) } @Test fun `need to do nothing`() { val path = myersDifferenceAlgorithmImpl("aaaa", "aaaa") - path shouldBe listOf() + path shouldBe listOf( + Pair(0, 0), + Pair(1, 1), + Pair(2, 2), + Pair(3, 3), + Pair(4, 4), + ) } @Test fun `need to remove each row of dfA and insert each row of dfB`() { val path = myersDifferenceAlgorithmImpl("abcd", "efgh") - path shouldBe listOf() + path shouldBe listOf( + Pair(0, 0), + Pair(1, 0), + Pair(2, 0), + Pair(3, 0), + Pair(4, 0), + Pair(4, 1), + Pair(4, 2), + Pair(4, 3), + Pair(4, 4), + ) } @Test fun `need to add each row`() { val path = myersDifferenceAlgorithmImpl("", "abc") - path shouldBe listOf() + path shouldBe listOf( + Pair(0, 0), + Pair(0, 1), + Pair(0, 2), + Pair(0, 3), + ) } } From 2de3ad14c198878aaf4cc9b6edb774026c19a2cc Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Fri, 14 Nov 2025 19:18:57 +0100 Subject: [PATCH 10/20] improve logic --- .../kotlinx/dataframe/columns/ValueColumn.kt | 2 + .../dataframe/impl/api/compareDataFrames.kt | 99 ++++++++++--------- .../dataframe/api/compareDataFrames.kt | 6 +- 3 files changed, 58 insertions(+), 49 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/columns/ValueColumn.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/columns/ValueColumn.kt index 17c92a44d5..8870fb86b5 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/columns/ValueColumn.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/columns/ValueColumn.kt @@ -1,6 +1,8 @@ package org.jetbrains.kotlinx.dataframe.columns import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.api.maxBy +import kotlin.getValue import kotlin.reflect.KProperty /** diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 8e7a40b5ef..f43946d188 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -1,21 +1,27 @@ package org.jetbrains.kotlinx.dataframe.impl.api /** - * dfs (will be) with same schema. Returns the path from origin to (N,M) in the edit path. + * dfs (will be) with same schema. Returns the path from origin to (N,M) in the edit graph. * N is dfA.nrow, M is dfB.nrow. * Knowing this path is knowing the differences between dfA and dfB * and the shortest edit script to get B from A. - * cost of this alg's worst case in O( (N+M)D ), D is the length of shortest edit script. - * snake: a set of diagonal edges, possibly empty + * The cost of this alg's worst case in O( (N+M)D ), D is the length of shortest edit script. + * + * The idea of the algorithm is the following: try to cross the edit graph making 'd' non-diagonal moves, + * increase 'd' until you succeed. + * Non-diagonal moves make edit script longer, while diagonal moves do not. + * + * snake: non-diagonal edge and then a possibly empty sequence of diagonal edges called a + * furthest reaching D-path endpoint: The endpoint of the longest d-path */ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList> { - // what I want from Myers alg + // Return value val path = mutableListOf>() // 'ses' stands for shortest edit script, next var is never returned, it is in the code // to show the capabilities of the algorithm var sesLength: Int? - // Myers algorithm, preparing val sumOfLength = dfA.length + dfB.length + // matrix containing the furthest reaching endpoints for each d val v = arrayListOf() for (d in 0..sumOfLength) { v.add(IntArray(sumOfLength * 2 + 1)) @@ -40,11 +46,11 @@ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList y += 1 } v[d][k + normalizer] = x - // need these datas in next iteration + // need this data in the next iteration if (d < sumOfLength) { v[d + 1][k + normalizer] = x } - // + // Edit graph was fully crossed if (x >= dfA.length && y >= dfB.length) { isOver = true sesLength = d @@ -52,7 +58,8 @@ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList break } } - d++ // try with a longer edit script + // try with a longer edit script + d++ } return path } @@ -66,53 +73,53 @@ internal fun recoursivePathFill( dfA: String, dfB: String, ) { - // enlist my self + // Enlist my self val xCurrent = v[d][k + normalizer] val yCurrent = xCurrent - k path.add(Pair(xCurrent, yCurrent)) - // choose the furthest reaching endpoint that precedes me - // To list an optimal path from (0,0) to the point Vd[k] first determine - // whether it is at the end of a maximal snake following a vertical edge from Vd−1[k+1] or a horizontal edge - // from Vd−1[k−1] + // I look for the furthest reaching endpoint that precedes me, it is represented by kPrev. + // It will be an argument of the next recoursive step. + // The idea is the following: knowing my d and my k means knowing the f.r.e. that precedes me. + // Moreover, I need to enlist the points composing the snake that precedes me (it may be empty). if (d > 0) { - val kTry1 = k + 1 - val kTry2 = k - 1 - val tries = listOf(kTry1, kTry2) - for (kT in tries) { - var xPrev = v[d - 1][kT + normalizer] - var yPrev = xPrev - kT - if (kT == kTry1) { - yPrev++ - } else { - xPrev++ - } - val snake = mutableListOf>() - var skipThisRoundOfOuterLoop = false - do { - snake.add(0, Pair(xPrev, yPrev)) - if (xPrev == xCurrent && yPrev == yCurrent) { - if (snake.isNotEmpty()) { - snake.removeFirst() - for (e in snake) { - path.add(e) - } + var kPrev: Int? = null + var xSnake: Int? = null + if (k == -d || k != d && v[d][k - 1 + normalizer] < v[d][k + 1 + normalizer]) { + kPrev = k + 1 + xSnake = v[d - 1][kPrev + normalizer] + } else { + kPrev = k - 1 + xSnake = v[d - 1][kPrev + normalizer] + 1 + } + var ySnake = xSnake - k + val snake = mutableListOf>() + do { + snake.add(0, Pair(xSnake, ySnake)) + if (xSnake == xCurrent && ySnake == yCurrent) { + if (snake.isNotEmpty()) { + snake.removeFirst() + for (e in snake) { + path.add(e) } - recoursivePathFill(path, v, d - 1, kT, normalizer, dfA, dfB) - return - } - if (xPrev < dfA.length && yPrev < dfB.length && xPrev >= 0 && yPrev >= 0 && dfA[xPrev] == dfB[yPrev]) { - xPrev += 1 - yPrev += 1 - } else { - skipThisRoundOfOuterLoop = true } + recoursivePathFill(path, v, d - 1, kPrev, normalizer, dfA, dfB) + return + } + if (xSnake < dfA.length && + ySnake < dfB.length && + xSnake >= 0 && + ySnake >= 0 && + dfA[xSnake] == dfB[ySnake] + ) { + xSnake += 1 + ySnake += 1 } - while (xPrev <= xCurrent && yPrev <= yCurrent && !skipThisRoundOfOuterLoop) } + while (xSnake <= xCurrent && ySnake <= yCurrent) } - // step base, - // eventually need to build the snake from origin to the furthest reaching point with d=0 - // moreover the path is reversed so that it can be read from left to right correctly + // Step base. + // Eventually need to add diagonal edges from origin to the furthest reaching point with d=0. + // Moreover, the path is reversed so that it can be read from left to right correctly if (d == 0) { if (path.last().first != 0 && path.last().second != 0) { val last = path.last() diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt index 1d364a0676..6b9231fb87 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -14,9 +14,9 @@ class CompareDataFramesTest { Pair(1, 0), Pair(2, 0), Pair(3, 1), - Pair(4, 1), - Pair(5, 2), - Pair(5, 3), + Pair(3, 2), + Pair(4, 3), + Pair(5, 4), Pair(6, 4), Pair(7, 5), Pair(7, 6), From 352bf455ac76c6477a812b5e56c65fe52d3a6256 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sat, 15 Nov 2025 08:37:29 +0100 Subject: [PATCH 11/20] improving comments --- .../kotlinx/dataframe/impl/api/compareDataFrames.kt | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index f43946d188..2242ac2087 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -11,8 +11,8 @@ package org.jetbrains.kotlinx.dataframe.impl.api * increase 'd' until you succeed. * Non-diagonal moves make edit script longer, while diagonal moves do not. * - * snake: non-diagonal edge and then a possibly empty sequence of diagonal edges called a - * furthest reaching D-path endpoint: The endpoint of the longest d-path + * snake: non-diagonal edge and then a possibly empty sequence of diagonal edges + * D-path: a path starting at (0,0) that has exactly D non-diagonal edges */ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList> { // Return value @@ -21,7 +21,8 @@ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList // to show the capabilities of the algorithm var sesLength: Int? val sumOfLength = dfA.length + dfB.length - // matrix containing the furthest reaching endpoints for each d + // matrix containing the endpoint of the furthest reaching D-path ending in diagonal k + // for each d-k couple of interest val v = arrayListOf() for (d in 0..sumOfLength) { v.add(IntArray(sumOfLength * 2 + 1)) @@ -31,10 +32,13 @@ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList // 0 position is -(M+N) position in the alg's paper -> need to normalize each access to v val normalizer = sumOfLength v[0][1 + normalizer] = 0 // fitticious + // d is the number of non-diagonal edges var d = 0 while (d <= sumOfLength && !isOver) { for (k in -d..d step 2) { var x: Int? + // Each furthest reaching D-path ending in diagonal k + // is built by exploiting the furthest reaching (D-1)-path ending in k-1 or (exclusive or) k+1 if (k == -d || k != d && v[d][k - 1 + normalizer] < v[d][k + 1 + normalizer]) { x = v[d][k + 1 + normalizer] } else { @@ -77,9 +81,8 @@ internal fun recoursivePathFill( val xCurrent = v[d][k + normalizer] val yCurrent = xCurrent - k path.add(Pair(xCurrent, yCurrent)) - // I look for the furthest reaching endpoint that precedes me, it is represented by kPrev. + // I look for endpoint I was built from, it is represented by kPrev. // It will be an argument of the next recoursive step. - // The idea is the following: knowing my d and my k means knowing the f.r.e. that precedes me. // Moreover, I need to enlist the points composing the snake that precedes me (it may be empty). if (d > 0) { var kPrev: Int? = null From 0db34d8df521f1a23122290705aa21c623068ae1 Mon Sep 17 00:00:00 2001 From: CarloMariaProietti Date: Sat, 15 Nov 2025 08:42:25 +0100 Subject: [PATCH 12/20] Update ValueColumn.kt --- .../org/jetbrains/kotlinx/dataframe/columns/ValueColumn.kt | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/columns/ValueColumn.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/columns/ValueColumn.kt index 8870fb86b5..17c92a44d5 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/columns/ValueColumn.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/columns/ValueColumn.kt @@ -1,8 +1,6 @@ package org.jetbrains.kotlinx.dataframe.columns import org.jetbrains.kotlinx.dataframe.DataColumn -import org.jetbrains.kotlinx.dataframe.api.maxBy -import kotlin.getValue import kotlin.reflect.KProperty /** From 3387489e9a6689533d8d4712159e2267d976cc2f Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sat, 15 Nov 2025 13:12:08 +0100 Subject: [PATCH 13/20] works fine with df --- .../dataframe/impl/api/compareDataFrames.kt | 21 +++--- .../dataframe/api/compareDataFrames.kt | 66 ++++++++++++++++--- 2 files changed, 68 insertions(+), 19 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 2242ac2087..9bca9a039a 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -1,5 +1,8 @@ package org.jetbrains.kotlinx.dataframe.impl.api +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.nrow + /** * dfs (will be) with same schema. Returns the path from origin to (N,M) in the edit graph. * N is dfA.nrow, M is dfB.nrow. @@ -14,13 +17,13 @@ package org.jetbrains.kotlinx.dataframe.impl.api * snake: non-diagonal edge and then a possibly empty sequence of diagonal edges * D-path: a path starting at (0,0) that has exactly D non-diagonal edges */ -internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList> { +internal fun myersDifferenceAlgorithmImpl(dfA: DataFrame, dfB: DataFrame): MutableList> { // Return value val path = mutableListOf>() // 'ses' stands for shortest edit script, next var is never returned, it is in the code // to show the capabilities of the algorithm var sesLength: Int? - val sumOfLength = dfA.length + dfB.length + val sumOfLength = dfA.nrow + dfB.nrow // matrix containing the endpoint of the furthest reaching D-path ending in diagonal k // for each d-k couple of interest val v = arrayListOf() @@ -45,7 +48,7 @@ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList x = v[d][k - 1 + normalizer] + 1 } var y = x - k - while (x < dfA.length && y < dfB.length && dfA[x] == dfB[y]) { + while (x < dfA.nrow && y < dfB.nrow && dfA[x] == dfB[y]) { x += 1 y += 1 } @@ -55,7 +58,7 @@ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList v[d + 1][k + normalizer] = x } // Edit graph was fully crossed - if (x >= dfA.length && y >= dfB.length) { + if (x >= dfA.nrow && y >= dfB.nrow) { isOver = true sesLength = d recoursivePathFill(path, v, d, k, normalizer, dfA, dfB) @@ -68,14 +71,14 @@ internal fun myersDifferenceAlgorithmImpl(dfA: String, dfB: String): MutableList return path } -internal fun recoursivePathFill( +internal fun recoursivePathFill( path: MutableList>, v: ArrayList, d: Int, k: Int, normalizer: Int, - dfA: String, - dfB: String, + dfA: DataFrame, + dfB: DataFrame, ) { // Enlist my self val xCurrent = v[d][k + normalizer] @@ -108,8 +111,8 @@ internal fun recoursivePathFill( recoursivePathFill(path, v, d - 1, kPrev, normalizer, dfA, dfB) return } - if (xSnake < dfA.length && - ySnake < dfB.length && + if (xSnake < dfA.nrow && + ySnake < dfB.nrow && xSnake >= 0 && ySnake >= 0 && dfA[xSnake] == dfB[ySnake] diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt index 6b9231fb87..b15104990f 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -1,6 +1,7 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.matchers.shouldBe +import io.kotest.matchers.shouldNotBe import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl import org.junit.Test import kotlin.Pair @@ -8,7 +9,15 @@ import kotlin.Pair class CompareDataFramesTest { @Test fun `Need both to delete and insert rows, preserving some rows`() { - val path = myersDifferenceAlgorithmImpl("abcabba", "cbabac") + //dfA + val x by columnOf(0, 1, 2, 0, 1, 1, 0) + val y by columnOf("a", "b", "c", "a", "b", "b", "a") + val dfA = dataFrameOf(x, y) + //dfB + val k by columnOf(2, 1, 0, 1, 0, 2) + val z by columnOf("c", "b", "a", "b", "a", "c") + val dfB = dataFrameOf(k, z) + val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( Pair(0, 0), Pair(1, 0), @@ -25,35 +34,56 @@ class CompareDataFramesTest { @Test fun `need to do nothing`() { - val path = myersDifferenceAlgorithmImpl("aaaa", "aaaa") + //dfA + val x by columnOf(0, 0, 0) + val y by columnOf("a", "a", "a") + val dfA = dataFrameOf(x, y) + //dfB + val k by columnOf(0, 0, 0) + val z by columnOf("a", "a", "a") + val dfB = dataFrameOf(k, z) + val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( Pair(0, 0), Pair(1, 1), Pair(2, 2), Pair(3, 3), - Pair(4, 4), ) } @Test fun `need to remove each row of dfA and insert each row of dfB`() { - val path = myersDifferenceAlgorithmImpl("abcd", "efgh") + //dfA + val x by columnOf(0, 1, 2) + val y by columnOf("a", "b", "c") + val dfA = dataFrameOf(x, y) + //dfB + val k by columnOf(3, 4, 5) + val z by columnOf("d", "e", "f") + val dfB = dataFrameOf(k, z) + val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( Pair(0, 0), Pair(1, 0), Pair(2, 0), Pair(3, 0), - Pair(4, 0), - Pair(4, 1), - Pair(4, 2), - Pair(4, 3), - Pair(4, 4), + Pair(3, 1), + Pair(3, 2), + Pair(3, 3), ) } @Test fun `need to add each row`() { - val path = myersDifferenceAlgorithmImpl("", "abc") + //dfA + val x by columnOf(listOf()) + val y by columnOf(listOf()) + val dfA = dataFrameOf(x, y) + //dfB + val k by columnOf(0, 1, 2) + val z by columnOf("a", "b", "c") + val dfB = dataFrameOf(k, z) + val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( Pair(0, 0), Pair(0, 1), @@ -61,4 +91,20 @@ class CompareDataFramesTest { Pair(0, 3), ) } + + @Test + fun `compare rows`() { + val x by columnOf(1, 1) + val y by columnOf("a", "a") + val df = dataFrameOf(x, y) + df[0] shouldBe df[1] + } + + @Test + fun `compare rows2`() { + val x by columnOf(1, 2) + val y by columnOf("a", "b") + val df = dataFrameOf(x, y) + df[0] shouldNotBe df[1] + } } From e05264621303ce6cf3353298bb255c5761c3e858 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sat, 15 Nov 2025 16:26:13 +0100 Subject: [PATCH 14/20] compareImpl --- .../dataframe/impl/api/compareDataFrames.kt | 32 +++++++++++++++++-- .../dataframe/api/compareDataFrames.kt | 21 ++++-------- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 9bca9a039a..c8045bb72d 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -4,7 +4,32 @@ import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.nrow /** - * dfs (will be) with same schema. Returns the path from origin to (N,M) in the edit graph. + * returns a DataFrame whose rows communicate the differences between dfA and dfB + */ +internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): DataFrame<*> { + val shortestEditScript = myersDifferenceAlgorithmImpl(dfA, dfB) + var x: Int? + var y: Int? + var xPrev: Int? + var yPrev: Int? + + for(i in 1 until shortestEditScript.size) { + x=shortestEditScript[i].first + y=shortestEditScript[i].second + xPrev=shortestEditScript[i-1].first + yPrev=shortestEditScript[i-1].second + when { + xPrev+1==x&&yPrev+1==y -> //row in position 'x' of dfA was not removed + + xPrev+1==x -> //row in position 'x' of dfA was removed + + yPrev+1==y -> //row in position 'y' of dfB was inserted after row in position 'x' of dfA + } + } +} + +/** + * dfs with same schema. Returns the path from origin to (N,M) in the edit graph. * N is dfA.nrow, M is dfB.nrow. * Knowing this path is knowing the differences between dfA and dfB * and the shortest edit script to get B from A. @@ -17,7 +42,7 @@ import org.jetbrains.kotlinx.dataframe.nrow * snake: non-diagonal edge and then a possibly empty sequence of diagonal edges * D-path: a path starting at (0,0) that has exactly D non-diagonal edges */ -internal fun myersDifferenceAlgorithmImpl(dfA: DataFrame, dfB: DataFrame): MutableList> { +internal fun myersDifferenceAlgorithmImpl(dfA: DataFrame, dfB: DataFrame): List> { // Return value val path = mutableListOf>() // 'ses' stands for shortest edit script, next var is never returned, it is in the code @@ -68,7 +93,8 @@ internal fun myersDifferenceAlgorithmImpl(dfA: DataFrame, dfB: DataFrame< // try with a longer edit script d++ } - return path + val immutablePath = path.toList() + return immutablePath } internal fun recoursivePathFill( diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt index b15104990f..808743de0e 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -1,7 +1,6 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.matchers.shouldBe -import io.kotest.matchers.shouldNotBe import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl import org.junit.Test import kotlin.Pair @@ -93,18 +92,12 @@ class CompareDataFramesTest { } @Test - fun `compare rows`() { - val x by columnOf(1, 1) - val y by columnOf("a", "a") - val df = dataFrameOf(x, y) - df[0] shouldBe df[1] - } - - @Test - fun `compare rows2`() { - val x by columnOf(1, 2) - val y by columnOf("a", "b") - val df = dataFrameOf(x, y) - df[0] shouldNotBe df[1] + fun `describe`() { + //dfA + val x by columnOf(0, 1, 2, 0, 1, 1, 0) + val y by columnOf("a", "b", "c", "a", "b", "b", "a") + val dfA = dataFrameOf(x, y) + val r = dfA.describe() + r shouldBe emptyDataFrame() } } From 4db19bf82d2a62fc254a662571e4a1e8b320cf25 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sun, 16 Nov 2025 16:30:09 +0100 Subject: [PATCH 15/20] compare is ready to use --- .../dataframe/impl/api/compareDataFrames.kt | 51 ++++-- .../dataframe/api/compareDataFrames.kt | 170 +++++++++++++----- 2 files changed, 164 insertions(+), 57 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index c8045bb72d..4aaca3cab9 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -1,31 +1,58 @@ package org.jetbrains.kotlinx.dataframe.impl.api import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.DataRowSchema +import org.jetbrains.kotlinx.dataframe.api.concat +import org.jetbrains.kotlinx.dataframe.api.dataFrameOf +import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame import org.jetbrains.kotlinx.dataframe.nrow +internal class ComparisonDescription( + val rowAtIndex: Int, + val of: String, + val wasRemoved: Boolean?, + val wasInserted: Boolean?, + val afterRow: Int?, +) : DataRowSchema + /** - * returns a DataFrame whose rows communicate the differences between dfA and dfB + * Returns a DataFrame whose rows explain the differences between dfA and dfB. + * One must think of the set of commands in a script as being executed simultaneously */ -internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): DataFrame<*> { +internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): DataFrame { + var comparisonDf = emptyDataFrame() + // make the comparison exploiting Myers difference algorithm val shortestEditScript = myersDifferenceAlgorithmImpl(dfA, dfB) var x: Int? var y: Int? var xPrev: Int? var yPrev: Int? - - for(i in 1 until shortestEditScript.size) { - x=shortestEditScript[i].first - y=shortestEditScript[i].second - xPrev=shortestEditScript[i-1].first - yPrev=shortestEditScript[i-1].second + for (i in 1 until shortestEditScript.size) { + x = shortestEditScript[i].first + y = shortestEditScript[i].second + xPrev = shortestEditScript[i - 1].first + yPrev = shortestEditScript[i - 1].second when { - xPrev+1==x&&yPrev+1==y -> //row in position 'x' of dfA was not removed - - xPrev+1==x -> //row in position 'x' of dfA was removed + // row in position 'x' of dfA was removed + xPrev + 1 == x && yPrev + 1 != y -> { + comparisonDf = comparisonDf.concat( + dataFrameOf + (ComparisonDescription(x-1, "dfA", true, null, null)), + ) + } - yPrev+1==y -> //row in position 'y' of dfB was inserted after row in position 'x' of dfA + // row in position 'y' of dfB was inserted after row in position 'x' of dfA + yPrev + 1 == y && xPrev + 1 != x -> { + comparisonDf = comparisonDf.concat( + dataFrameOf( + ComparisonDescription + (y-1, "dfB", null, true, x-1), + ), + ) + } } } + return comparisonDf } /** diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt index 808743de0e..721d1475fc 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -1,21 +1,109 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.matchers.shouldBe +import org.jetbrains.kotlinx.dataframe.impl.api.ComparisonDescription +import org.jetbrains.kotlinx.dataframe.impl.api.compareDataFramesImpl import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl import org.junit.Test import kotlin.Pair +private class SchemaForThisTest(val integer: Int, val string: String) : DataRowSchema + class CompareDataFramesTest { + + // compareDataFrames region + @Test fun `Need both to delete and insert rows, preserving some rows`() { - //dfA - val x by columnOf(0, 1, 2, 0, 1, 1, 0) - val y by columnOf("a", "b", "c", "a", "b", "b", "a") - val dfA = dataFrameOf(x, y) - //dfB - val k by columnOf(2, 1, 0, 1, 0, 2) - val z by columnOf("c", "b", "a", "b", "a", "c") - val dfB = dataFrameOf(k, z) + val dfA = dataFrameOf( + SchemaForThisTest(0, "a"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(2, "c"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(0, "a"), + ) + val dfB = dataFrameOf( + SchemaForThisTest(2, "c"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(2, "c"), + ) + val comparison = compareDataFramesImpl(dfA, dfB) + comparison shouldBe dataFrameOf( + ComparisonDescription(0, "dfA", true, null, null), + ComparisonDescription(1, "dfA", true, null, null), + ComparisonDescription(1, "dfB", null, true, 2), + ComparisonDescription(5, "dfA", true, null, null), + ComparisonDescription(5, "dfB", null, true, 6), + ) + } + + @Test + fun `need to do nothing`() { + val dfA = dataFrameOf( + SchemaForThisTest(0, "a"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(0, "a"), + ) + val dfB = dataFrameOf( + SchemaForThisTest(0, "a"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(0, "a"), + ) + val comparison = compareDataFramesImpl(dfA, dfB) + comparison shouldBe emptyDataFrame() + } + + @Test + fun `need to remove each row of dfA and insert each row of dfB`() { + val dfA = dataFrameOf( + SchemaForThisTest(0, "a"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(2, "c"), + ) + val dfB = dataFrameOf( + SchemaForThisTest(3, "d"), + SchemaForThisTest(4, "e"), + SchemaForThisTest(5, "f"), + ) + val comparison = compareDataFramesImpl(dfA, dfB) + comparison shouldBe dataFrameOf( + ComparisonDescription(0, "dfA", true, null, null), + ComparisonDescription(1, "dfA", true, null, null), + ComparisonDescription(2, "dfA", true, null, null), + ComparisonDescription(0, "dfB", null, true, 2), + ComparisonDescription(1, "dfB", null, true, 2), + ComparisonDescription(2, "dfB", null, true, 2), + ) + } + + // end region + + // Myers algorithm region + + @Test + fun `Need both to delete and insert rows, preserving some rows, Myers algorithm`() { + val dfA = dataFrameOf( + SchemaForThisTest(0, "a"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(2, "c"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(0, "a"), + ) + val dfB = dataFrameOf( + SchemaForThisTest(2, "c"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(2, "c"), + ) val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( Pair(0, 0), @@ -32,15 +120,17 @@ class CompareDataFramesTest { } @Test - fun `need to do nothing`() { - //dfA - val x by columnOf(0, 0, 0) - val y by columnOf("a", "a", "a") - val dfA = dataFrameOf(x, y) - //dfB - val k by columnOf(0, 0, 0) - val z by columnOf("a", "a", "a") - val dfB = dataFrameOf(k, z) + fun `need to do nothing, Myers algorithm`() { + val dfA = dataFrameOf( + SchemaForThisTest(0, "a"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(0, "a"), + ) + val dfB = dataFrameOf( + SchemaForThisTest(0, "a"), + SchemaForThisTest(0, "a"), + SchemaForThisTest(0, "a"), + ) val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( Pair(0, 0), @@ -51,15 +141,17 @@ class CompareDataFramesTest { } @Test - fun `need to remove each row of dfA and insert each row of dfB`() { - //dfA - val x by columnOf(0, 1, 2) - val y by columnOf("a", "b", "c") - val dfA = dataFrameOf(x, y) - //dfB - val k by columnOf(3, 4, 5) - val z by columnOf("d", "e", "f") - val dfB = dataFrameOf(k, z) + fun `need to remove each row of dfA and insert each row of dfB, Myers Algorithm`() { + val dfA = dataFrameOf( + SchemaForThisTest(0, "a"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(2, "c"), + ) + val dfB = dataFrameOf( + SchemaForThisTest(3, "d"), + SchemaForThisTest(4, "e"), + SchemaForThisTest(5, "f"), + ) val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( Pair(0, 0), @@ -73,15 +165,13 @@ class CompareDataFramesTest { } @Test - fun `need to add each row`() { - //dfA - val x by columnOf(listOf()) - val y by columnOf(listOf()) - val dfA = dataFrameOf(x, y) - //dfB - val k by columnOf(0, 1, 2) - val z by columnOf("a", "b", "c") - val dfB = dataFrameOf(k, z) + fun `need to add each row, Myers algorithm`() { + val dfA = emptyDataFrame() + val dfB = dataFrameOf( + SchemaForThisTest(0, "a"), + SchemaForThisTest(1, "b"), + SchemaForThisTest(2, "c"), + ) val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( Pair(0, 0), @@ -90,14 +180,4 @@ class CompareDataFramesTest { Pair(0, 3), ) } - - @Test - fun `describe`() { - //dfA - val x by columnOf(0, 1, 2, 0, 1, 1, 0) - val y by columnOf("a", "b", "c", "a", "b", "b", "a") - val dfA = dataFrameOf(x, y) - val r = dfA.describe() - r shouldBe emptyDataFrame() - } } From b4be510f57b5af4d71a003f3fb501551502c8ea7 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sun, 16 Nov 2025 19:50:59 +0100 Subject: [PATCH 16/20] ready for review --- .../kotlinx/dataframe/impl/api/compareDataFrames.kt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 4aaca3cab9..5bed35482d 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -21,7 +21,7 @@ internal class ComparisonDescription( */ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): DataFrame { var comparisonDf = emptyDataFrame() - // make the comparison exploiting Myers difference algorithm + // compare by exploiting Myers difference algorithm val shortestEditScript = myersDifferenceAlgorithmImpl(dfA, dfB) var x: Int? var y: Int? @@ -33,20 +33,20 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da xPrev = shortestEditScript[i - 1].first yPrev = shortestEditScript[i - 1].second when { - // row in position 'x' of dfA was removed + // row at index 'x-1' of dfA was removed xPrev + 1 == x && yPrev + 1 != y -> { comparisonDf = comparisonDf.concat( dataFrameOf - (ComparisonDescription(x-1, "dfA", true, null, null)), + (ComparisonDescription(x - 1, "dfA", true, null, null)), ) } - // row in position 'y' of dfB was inserted after row in position 'x' of dfA + // row at index 'y-1' of dfB was inserted after row in position 'x-1' of dfA yPrev + 1 == y && xPrev + 1 != x -> { comparisonDf = comparisonDf.concat( dataFrameOf( ComparisonDescription - (y-1, "dfB", null, true, x-1), + (y - 1, "dfB", null, true, x - 1), ), ) } @@ -56,7 +56,7 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da } /** - * dfs with same schema. Returns the path from origin to (N,M) in the edit graph. + * dfs with same schema. Returns an optimal path from origin to (N,M) in the edit graph. * N is dfA.nrow, M is dfB.nrow. * Knowing this path is knowing the differences between dfA and dfB * and the shortest edit script to get B from A. From 483163db25059696cdcca2d66ef294690644f5fe Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Fri, 28 Nov 2025 16:03:10 +0100 Subject: [PATCH 17/20] changing code accoring to the review, still have to add enums --- .../dataframe/impl/api/compareDataFrames.kt | 50 +++++++++++-------- .../dataframe/api/compareDataFrames.kt | 4 +- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 5bed35482d..8e2081fbbb 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -1,17 +1,19 @@ package org.jetbrains.kotlinx.dataframe.impl.api import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.annotations.DataSchema import org.jetbrains.kotlinx.dataframe.api.DataRowSchema import org.jetbrains.kotlinx.dataframe.api.concat import org.jetbrains.kotlinx.dataframe.api.dataFrameOf import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame import org.jetbrains.kotlinx.dataframe.nrow +@DataSchema internal class ComparisonDescription( val rowAtIndex: Int, val of: String, val wasRemoved: Boolean?, - val wasInserted: Boolean?, + val insertedAfterRow: Boolean?, val afterRow: Int?, ) : DataRowSchema @@ -23,15 +25,11 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da var comparisonDf = emptyDataFrame() // compare by exploiting Myers difference algorithm val shortestEditScript = myersDifferenceAlgorithmImpl(dfA, dfB) - var x: Int? - var y: Int? - var xPrev: Int? - var yPrev: Int? for (i in 1 until shortestEditScript.size) { - x = shortestEditScript[i].first - y = shortestEditScript[i].second - xPrev = shortestEditScript[i - 1].first - yPrev = shortestEditScript[i - 1].second + val x = shortestEditScript[i].first + val y = shortestEditScript[i].second + val xPrev = shortestEditScript[i - 1].first + val yPrev = shortestEditScript[i - 1].second when { // row at index 'x-1' of dfA was removed xPrev + 1 == x && yPrev + 1 != y -> { @@ -43,10 +41,18 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da // row at index 'y-1' of dfB was inserted after row in position 'x-1' of dfA yPrev + 1 == y && xPrev + 1 != x -> { + val indexOfInsertedRow = y - 1 + val sourceDfOfInsertedRow = "dfB" + val indexOfReferenceRow = x - 1 comparisonDf = comparisonDf.concat( dataFrameOf( - ComparisonDescription - (y - 1, "dfB", null, true, x - 1), + ComparisonDescription( + indexOfInsertedRow, + sourceDfOfInsertedRow, + null, + true, + indexOfReferenceRow, + ), ), ) } @@ -74,24 +80,24 @@ internal fun myersDifferenceAlgorithmImpl(dfA: DataFrame, dfB: DataFrame< val path = mutableListOf>() // 'ses' stands for shortest edit script, next var is never returned, it is in the code // to show the capabilities of the algorithm - var sesLength: Int? + var sesLength: Int val sumOfLength = dfA.nrow + dfB.nrow // matrix containing the endpoint of the furthest reaching D-path ending in diagonal k // for each d-k couple of interest - val v = arrayListOf() - for (d in 0..sumOfLength) { + val v = mutableListOf() + repeat(sumOfLength + 1) { v.add(IntArray(sumOfLength * 2 + 1)) } var isOver = false // starting the algorithm // 0 position is -(M+N) position in the alg's paper -> need to normalize each access to v val normalizer = sumOfLength - v[0][1 + normalizer] = 0 // fitticious + v[0][1 + normalizer] = 0 // fictitious // d is the number of non-diagonal edges var d = 0 while (d <= sumOfLength && !isOver) { for (k in -d..d step 2) { - var x: Int? + var x: Int // Each furthest reaching D-path ending in diagonal k // is built by exploiting the furthest reaching (D-1)-path ending in k-1 or (exclusive or) k+1 if (k == -d || k != d && v[d][k - 1 + normalizer] < v[d][k + 1 + normalizer]) { @@ -113,7 +119,7 @@ internal fun myersDifferenceAlgorithmImpl(dfA: DataFrame, dfB: DataFrame< if (x >= dfA.nrow && y >= dfB.nrow) { isOver = true sesLength = d - recoursivePathFill(path, v, d, k, normalizer, dfA, dfB) + tailrec(path, v, d, k, normalizer, dfA, dfB) break } } @@ -124,9 +130,9 @@ internal fun myersDifferenceAlgorithmImpl(dfA: DataFrame, dfB: DataFrame< return immutablePath } -internal fun recoursivePathFill( +internal fun tailrec( path: MutableList>, - v: ArrayList, + v: MutableList, d: Int, k: Int, normalizer: Int, @@ -141,8 +147,8 @@ internal fun recoursivePathFill( // It will be an argument of the next recoursive step. // Moreover, I need to enlist the points composing the snake that precedes me (it may be empty). if (d > 0) { - var kPrev: Int? = null - var xSnake: Int? = null + var kPrev: Int + var xSnake: Int if (k == -d || k != d && v[d][k - 1 + normalizer] < v[d][k + 1 + normalizer]) { kPrev = k + 1 xSnake = v[d - 1][kPrev + normalizer] @@ -161,7 +167,7 @@ internal fun recoursivePathFill( path.add(e) } } - recoursivePathFill(path, v, d - 1, kPrev, normalizer, dfA, dfB) + tailrec(path, v, d - 1, kPrev, normalizer, dfA, dfB) return } if (xSnake < dfA.nrow && diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt index 721d1475fc..d42c879861 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -1,13 +1,15 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.matchers.shouldBe +import org.jetbrains.kotlinx.dataframe.annotations.DataSchema import org.jetbrains.kotlinx.dataframe.impl.api.ComparisonDescription import org.jetbrains.kotlinx.dataframe.impl.api.compareDataFramesImpl import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl import org.junit.Test import kotlin.Pair -private class SchemaForThisTest(val integer: Int, val string: String) : DataRowSchema +@DataSchema +internal class SchemaForThisTest(val integer: Int, val string: String) : DataRowSchema class CompareDataFramesTest { From 1ebe865286db39cbf4c8799985a9f0b8b7737d54 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Sat, 29 Nov 2025 12:14:58 +0100 Subject: [PATCH 18/20] first review accompished --- .../dataframe/impl/api/compareDataFrames.kt | 35 +++-- .../dataframe/api/compareDataFrames.kt | 134 +++++++++--------- 2 files changed, 95 insertions(+), 74 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 8e2081fbbb..9caba105f4 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -11,12 +11,22 @@ import org.jetbrains.kotlinx.dataframe.nrow @DataSchema internal class ComparisonDescription( val rowAtIndex: Int, - val of: String, - val wasRemoved: Boolean?, - val insertedAfterRow: Boolean?, + val of: DataFrameOfComparison, + val wasRemoved: RowOfComparison?, + val insertedAfterRow: RowOfComparison?, val afterRow: Int?, ) : DataRowSchema +internal enum class DataFrameOfComparison { + DFA, + DFB, +} + +internal enum class RowOfComparison { + WAS_INSERTED_AFTER_ROW, + WAS_REMOVED, +} + /** * Returns a DataFrame whose rows explain the differences between dfA and dfB. * One must think of the set of commands in a script as being executed simultaneously @@ -33,16 +43,25 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da when { // row at index 'x-1' of dfA was removed xPrev + 1 == x && yPrev + 1 != y -> { + val indexOfRemovedRow = x - 1 + val sourceDfOfRemovedRow = DataFrameOfComparison.DFA comparisonDf = comparisonDf.concat( - dataFrameOf - (ComparisonDescription(x - 1, "dfA", true, null, null)), + dataFrameOf( + ComparisonDescription( + indexOfRemovedRow, + sourceDfOfRemovedRow, + RowOfComparison.WAS_REMOVED, + null, + null, + ), + ), ) } // row at index 'y-1' of dfB was inserted after row in position 'x-1' of dfA yPrev + 1 == y && xPrev + 1 != x -> { val indexOfInsertedRow = y - 1 - val sourceDfOfInsertedRow = "dfB" + val sourceDfOfInsertedRow = DataFrameOfComparison.DFB val indexOfReferenceRow = x - 1 comparisonDf = comparisonDf.concat( dataFrameOf( @@ -50,7 +69,7 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da indexOfInsertedRow, sourceDfOfInsertedRow, null, - true, + RowOfComparison.WAS_INSERTED_AFTER_ROW, indexOfReferenceRow, ), ), @@ -144,7 +163,7 @@ internal fun tailrec( val yCurrent = xCurrent - k path.add(Pair(xCurrent, yCurrent)) // I look for endpoint I was built from, it is represented by kPrev. - // It will be an argument of the next recoursive step. + // It will be an argument of the next recursive step. // Moreover, I need to enlist the points composing the snake that precedes me (it may be empty). if (d > 0) { var kPrev: Int diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt index d42c879861..873bd6331e 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -3,13 +3,15 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.matchers.shouldBe import org.jetbrains.kotlinx.dataframe.annotations.DataSchema import org.jetbrains.kotlinx.dataframe.impl.api.ComparisonDescription +import org.jetbrains.kotlinx.dataframe.impl.api.DataFrameOfComparison +import org.jetbrains.kotlinx.dataframe.impl.api.RowOfComparison import org.jetbrains.kotlinx.dataframe.impl.api.compareDataFramesImpl import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl import org.junit.Test import kotlin.Pair @DataSchema -internal class SchemaForThisTest(val integer: Int, val string: String) : DataRowSchema +internal class SchemaForCompareDfTest(val integer: Int, val string: String) : DataRowSchema class CompareDataFramesTest { @@ -18,43 +20,43 @@ class CompareDataFramesTest { @Test fun `Need both to delete and insert rows, preserving some rows`() { val dfA = dataFrameOf( - SchemaForThisTest(0, "a"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(2, "c"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(2, "c"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(0, "a"), ) val dfB = dataFrameOf( - SchemaForThisTest(2, "c"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(2, "c"), + SchemaForCompareDfTest(2, "c"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(2, "c"), ) val comparison = compareDataFramesImpl(dfA, dfB) comparison shouldBe dataFrameOf( - ComparisonDescription(0, "dfA", true, null, null), - ComparisonDescription(1, "dfA", true, null, null), - ComparisonDescription(1, "dfB", null, true, 2), - ComparisonDescription(5, "dfA", true, null, null), - ComparisonDescription(5, "dfB", null, true, 6), + ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), + ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), + ComparisonDescription(1, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 2), + ComparisonDescription(5, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), + ComparisonDescription(5, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 6), ) } @Test fun `need to do nothing`() { val dfA = dataFrameOf( - SchemaForThisTest(0, "a"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(0, "a"), ) val dfB = dataFrameOf( - SchemaForThisTest(0, "a"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(0, "a"), ) val comparison = compareDataFramesImpl(dfA, dfB) comparison shouldBe emptyDataFrame() @@ -63,23 +65,23 @@ class CompareDataFramesTest { @Test fun `need to remove each row of dfA and insert each row of dfB`() { val dfA = dataFrameOf( - SchemaForThisTest(0, "a"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(2, "c"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(2, "c"), ) val dfB = dataFrameOf( - SchemaForThisTest(3, "d"), - SchemaForThisTest(4, "e"), - SchemaForThisTest(5, "f"), + SchemaForCompareDfTest(3, "d"), + SchemaForCompareDfTest(4, "e"), + SchemaForCompareDfTest(5, "f"), ) val comparison = compareDataFramesImpl(dfA, dfB) comparison shouldBe dataFrameOf( - ComparisonDescription(0, "dfA", true, null, null), - ComparisonDescription(1, "dfA", true, null, null), - ComparisonDescription(2, "dfA", true, null, null), - ComparisonDescription(0, "dfB", null, true, 2), - ComparisonDescription(1, "dfB", null, true, 2), - ComparisonDescription(2, "dfB", null, true, 2), + ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), + ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), + ComparisonDescription(2, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), + ComparisonDescription(0, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 2), + ComparisonDescription(1, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 2), + ComparisonDescription(2, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 2), ) } @@ -90,21 +92,21 @@ class CompareDataFramesTest { @Test fun `Need both to delete and insert rows, preserving some rows, Myers algorithm`() { val dfA = dataFrameOf( - SchemaForThisTest(0, "a"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(2, "c"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(2, "c"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(0, "a"), ) val dfB = dataFrameOf( - SchemaForThisTest(2, "c"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(2, "c"), + SchemaForCompareDfTest(2, "c"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(2, "c"), ) val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( @@ -124,14 +126,14 @@ class CompareDataFramesTest { @Test fun `need to do nothing, Myers algorithm`() { val dfA = dataFrameOf( - SchemaForThisTest(0, "a"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(0, "a"), ) val dfB = dataFrameOf( - SchemaForThisTest(0, "a"), - SchemaForThisTest(0, "a"), - SchemaForThisTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(0, "a"), ) val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( @@ -145,14 +147,14 @@ class CompareDataFramesTest { @Test fun `need to remove each row of dfA and insert each row of dfB, Myers Algorithm`() { val dfA = dataFrameOf( - SchemaForThisTest(0, "a"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(2, "c"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(2, "c"), ) val dfB = dataFrameOf( - SchemaForThisTest(3, "d"), - SchemaForThisTest(4, "e"), - SchemaForThisTest(5, "f"), + SchemaForCompareDfTest(3, "d"), + SchemaForCompareDfTest(4, "e"), + SchemaForCompareDfTest(5, "f"), ) val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( @@ -168,11 +170,11 @@ class CompareDataFramesTest { @Test fun `need to add each row, Myers algorithm`() { - val dfA = emptyDataFrame() + val dfA = emptyDataFrame() val dfB = dataFrameOf( - SchemaForThisTest(0, "a"), - SchemaForThisTest(1, "b"), - SchemaForThisTest(2, "c"), + SchemaForCompareDfTest(0, "a"), + SchemaForCompareDfTest(1, "b"), + SchemaForCompareDfTest(2, "c"), ) val path = myersDifferenceAlgorithmImpl(dfA, dfB) path shouldBe listOf( From c35ef21a7716bba18e30ec64ca6f35fc0ef25ce9 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Mon, 1 Dec 2025 16:07:04 +0100 Subject: [PATCH 19/20] fix enum --- .../dataframe/impl/api/compareDataFrames.kt | 21 ++++++++---------- .../dataframe/api/compareDataFrames.kt | 22 +++++++++---------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 9caba105f4..3496d85771 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -12,9 +12,8 @@ import org.jetbrains.kotlinx.dataframe.nrow internal class ComparisonDescription( val rowAtIndex: Int, val of: DataFrameOfComparison, - val wasRemoved: RowOfComparison?, - val insertedAfterRow: RowOfComparison?, - val afterRow: Int?, + val modification: RowOfComparison, + val insertedAfterRow: Int?, ) : DataRowSchema internal enum class DataFrameOfComparison { @@ -23,8 +22,8 @@ internal enum class DataFrameOfComparison { } internal enum class RowOfComparison { - WAS_INSERTED_AFTER_ROW, - WAS_REMOVED, + INSERTED, + REMOVED, } /** @@ -50,8 +49,7 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da ComparisonDescription( indexOfRemovedRow, sourceDfOfRemovedRow, - RowOfComparison.WAS_REMOVED, - null, + RowOfComparison.REMOVED, null, ), ), @@ -68,8 +66,7 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da ComparisonDescription( indexOfInsertedRow, sourceDfOfInsertedRow, - null, - RowOfComparison.WAS_INSERTED_AFTER_ROW, + RowOfComparison.INSERTED, indexOfReferenceRow, ), ), @@ -138,7 +135,7 @@ internal fun myersDifferenceAlgorithmImpl(dfA: DataFrame, dfB: DataFrame< if (x >= dfA.nrow && y >= dfB.nrow) { isOver = true sesLength = d - tailrec(path, v, d, k, normalizer, dfA, dfB) + recursivePathFill(path, v, d, k, normalizer, dfA, dfB) break } } @@ -149,7 +146,7 @@ internal fun myersDifferenceAlgorithmImpl(dfA: DataFrame, dfB: DataFrame< return immutablePath } -internal fun tailrec( +internal tailrec fun recursivePathFill( path: MutableList>, v: MutableList, d: Int, @@ -186,7 +183,7 @@ internal fun tailrec( path.add(e) } } - tailrec(path, v, d - 1, kPrev, normalizer, dfA, dfB) + recursivePathFill(path, v, d - 1, kPrev, normalizer, dfA, dfB) return } if (xSnake < dfA.nrow && diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt index 873bd6331e..8d009b409c 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -38,11 +38,11 @@ class CompareDataFramesTest { ) val comparison = compareDataFramesImpl(dfA, dfB) comparison shouldBe dataFrameOf( - ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), - ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), - ComparisonDescription(1, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 2), - ComparisonDescription(5, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), - ComparisonDescription(5, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 6), + ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), + ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), + ComparisonDescription(1, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2), + ComparisonDescription(5, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), + ComparisonDescription(5, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 6), ) } @@ -76,12 +76,12 @@ class CompareDataFramesTest { ) val comparison = compareDataFramesImpl(dfA, dfB) comparison shouldBe dataFrameOf( - ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), - ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), - ComparisonDescription(2, DataFrameOfComparison.DFA, RowOfComparison.WAS_REMOVED, null, null), - ComparisonDescription(0, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 2), - ComparisonDescription(1, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 2), - ComparisonDescription(2, DataFrameOfComparison.DFB, null, RowOfComparison.WAS_INSERTED_AFTER_ROW, 2), + ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), + ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), + ComparisonDescription(2, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), + ComparisonDescription(0, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2), + ComparisonDescription(1, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2), + ComparisonDescription(2, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2), ) } From f85a2f71c5f94764ec77d52b91e66847bfdf98b9 Mon Sep 17 00:00:00 2001 From: Carlo Maria Proietti Date: Mon, 1 Dec 2025 16:49:29 +0100 Subject: [PATCH 20/20] output includes modified row's content --- .../dataframe/impl/api/compareDataFrames.kt | 12 +++++++--- .../dataframe/api/compareDataFrames.kt | 22 +++++++++---------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt index 3496d85771..078aacc900 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt @@ -1,6 +1,7 @@ package org.jetbrains.kotlinx.dataframe.impl.api import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.annotations.DataSchema import org.jetbrains.kotlinx.dataframe.api.DataRowSchema import org.jetbrains.kotlinx.dataframe.api.concat @@ -9,11 +10,12 @@ import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame import org.jetbrains.kotlinx.dataframe.nrow @DataSchema -internal class ComparisonDescription( +internal class ComparisonDescription( val rowAtIndex: Int, val of: DataFrameOfComparison, val modification: RowOfComparison, val insertedAfterRow: Int?, + val modifiedRowContent: DataRow, ) : DataRowSchema internal enum class DataFrameOfComparison { @@ -30,8 +32,8 @@ internal enum class RowOfComparison { * Returns a DataFrame whose rows explain the differences between dfA and dfB. * One must think of the set of commands in a script as being executed simultaneously */ -internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): DataFrame { - var comparisonDf = emptyDataFrame() +internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): DataFrame> { + var comparisonDf = emptyDataFrame>() // compare by exploiting Myers difference algorithm val shortestEditScript = myersDifferenceAlgorithmImpl(dfA, dfB) for (i in 1 until shortestEditScript.size) { @@ -44,6 +46,7 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da xPrev + 1 == x && yPrev + 1 != y -> { val indexOfRemovedRow = x - 1 val sourceDfOfRemovedRow = DataFrameOfComparison.DFA + val rowContent = dfA[indexOfRemovedRow] comparisonDf = comparisonDf.concat( dataFrameOf( ComparisonDescription( @@ -51,6 +54,7 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da sourceDfOfRemovedRow, RowOfComparison.REMOVED, null, + rowContent, ), ), ) @@ -61,6 +65,7 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da val indexOfInsertedRow = y - 1 val sourceDfOfInsertedRow = DataFrameOfComparison.DFB val indexOfReferenceRow = x - 1 + val rowContent = dfB[indexOfInsertedRow] comparisonDf = comparisonDf.concat( dataFrameOf( ComparisonDescription( @@ -68,6 +73,7 @@ internal fun compareDataFramesImpl(dfA: DataFrame, dfB: DataFrame): Da sourceDfOfInsertedRow, RowOfComparison.INSERTED, indexOfReferenceRow, + rowContent, ), ), ) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt index 8d009b409c..97313cc00f 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt @@ -38,11 +38,11 @@ class CompareDataFramesTest { ) val comparison = compareDataFramesImpl(dfA, dfB) comparison shouldBe dataFrameOf( - ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), - ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), - ComparisonDescription(1, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2), - ComparisonDescription(5, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), - ComparisonDescription(5, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 6), + ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null, dfA[0]), + ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null, dfA[1]), + ComparisonDescription(1, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2, dfB[1]), + ComparisonDescription(5, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null, dfA[5]), + ComparisonDescription(5, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 6, dfB[5]), ) } @@ -76,12 +76,12 @@ class CompareDataFramesTest { ) val comparison = compareDataFramesImpl(dfA, dfB) comparison shouldBe dataFrameOf( - ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), - ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), - ComparisonDescription(2, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null), - ComparisonDescription(0, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2), - ComparisonDescription(1, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2), - ComparisonDescription(2, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2), + ComparisonDescription(0, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null, dfA[0]), + ComparisonDescription(1, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null, dfA[1]), + ComparisonDescription(2, DataFrameOfComparison.DFA, RowOfComparison.REMOVED, null, dfA[2]), + ComparisonDescription(0, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2, dfB[0]), + ComparisonDescription(1, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2, dfB[1]), + ComparisonDescription(2, DataFrameOfComparison.DFB, RowOfComparison.INSERTED, 2, dfB[2]), ) }