2
\$\begingroup\$

In a question about how to reverse a string in Java, a comment pointed out that Unicode combining code points need to be taken into account.

The code below works as intended for all test cases I tried. There are probably edge cases in other scripts and languages that I do not know about. I'd like to learn about these, as well as about any coding style issues.

package de.roland_illig.strrev

import com.ibm.icu.lang.UCharacter
import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.Test

/**
 * Returns the reversed string, keeping clusters of combining code points
 * (such as German umlauts or Arabic tashkīl) together.
 *
 * The string is split into clusters of code points. The clusters are emitted
 * in reverse order, while the code points inside each cluster keep their
 * original order. A cluster is extended by
 *
 *  - any code point whose combining class (as reported by
 *    [UCharacter.getCombiningClass]) is non-zero, and
 *  - the first alef (U+0627) that follows a lam (U+0644) starting the cluster,
 *    with only combining marks in between.
 *
 * Combining marks at the very beginning of the string have no base character
 * and therefore form a cluster of their own. The empty string is returned
 * unchanged.
 */
fun String.reverse(): String {
    val lam = 0x0644
    val alef = 0x0627

    fun isCombining(ch: Int) = UCharacter.getCombiningClass(ch) != 0

    // Only the first alef after the lam completes the lam-alef pair; without
    // the `alef !in cluster` check every further alef would be absorbed too.
    fun isLamAlef(cluster: List<Int>, ch: Int) =
            ch == alef && cluster.firstOrNull() == lam && alef !in cluster

    val clusters = mutableListOf<MutableList<Int>>()

    this.codePoints().forEachOrdered { ch ->
        val current = clusters.lastOrNull()
        if (current != null && (isCombining(ch) || isLamAlef(current, ch))) {
            current.add(ch)
        } else {
            // Either the very first code point or a new base character.
            clusters.add(mutableListOf(ch))
        }
    }

    return fromCodePoints(*clusters.asReversed().flatten().toIntArray())
}

/**
 * Tests for [String.reverse], covering plain ASCII, supplementary code
 * points, combining marks and Arabic text with diacritics.
 */
class StringReverseTest {

    /** Plain ASCII text is reversed character by character. */
    @Test
    fun ascii() {
        val reversed = "hello".reverse()

        assertThat(reversed).isEqualTo("olleh")
    }

    /** A single code point outside the BMP must not be split into its surrogates. */
    @Test
    fun surrogates() {
        val supplementary = fromCodePoints(0x1F645)
        val reversed = supplementary.reverse()

        assertThat(reversed).isEqualTo(supplementary)
    }

    /** A base letter followed by a combining diaeresis stays in that order. */
    @Test
    fun combining() {
        val letterWithDiaeresis = fromCodePoints(0x0041, 0x0308)
        val reversed = letterWithDiaeresis.reverse()

        assertThat(reversed).isEqualTo(letterWithDiaeresis)
    }

    /** Arabic text keeps its diacritics attached to their base letters. */
    @Test
    fun arabic() {
        val reversed = "أَهْلًا وَ سَهْلًا".reverse()

        assertThat(reversed).isEqualTo("لًاهْسَ وَ لًاهْأَ")
    }

    /** A combining mark without a preceding base letter is treated as its own cluster. */
    @Test
    fun combiningAtBeginning() {
        val leadingDiaeresis = fromCodePoints(0x0308, 0x0041)
        val expected = fromCodePoints(0x0041, 0x0308)

        assertThat(leadingDiaeresis.reverse()).isEqualTo(expected)
    }
}

/**
 * Builds a string from the given Unicode code points, in order.
 *
 * Code points outside the BMP are encoded as surrogate pairs. An invalid
 * code point results in an [IllegalArgumentException].
 */
private fun fromCodePoints(vararg codePoints: Int): String {
    val text = StringBuilder(codePoints.size)
    for (codePoint in codePoints) {
        text.appendCodePoint(codePoint)
    }
    return text.toString()
}

For completeness, here are the Gradle dependencies for build.gradle:

// Dependencies of the string-reversal example.
// NOTE(review): `compile`/`testCompile` are the legacy configuration names;
// newer Gradle versions use `implementation`/`testImplementation` — confirm
// against the Gradle version in use before changing.
dependencies {
    // Kotlin standard library; $kotlin_version is expected to be defined elsewhere in the build.
    compile "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version"
    // ICU4J provides com.ibm.icu.lang.UCharacter, used to look up combining classes.
    compile group: 'com.ibm.icu', name: 'icu4j', version: '61.1'

    // Test-only libraries: Kotlin test helpers, the JUnit 5 API and AssertJ assertions.
    // NOTE(review): only the JUnit Jupiter API is listed; presumably an engine is
    // supplied elsewhere for the tests to run — verify.
    testCompile "org.jetbrains.kotlin:kotlin-test:$kotlin_version"
    testCompile "org.junit.jupiter:junit-jupiter-api:5.0.2"
    testCompile "org.assertj:assertj-core:3.9.0"
}
\$\endgroup\$
0

0

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.