AlphabetConverter.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * <p>
 * Convert from one alphabet to another, with the possibility of leaving certain
 * characters unencoded.
 * </p>
 *
 * <p>
 * The target and 'do not encode' languages must be in the Unicode BMP, but the
 * source language does not.
 * </p>
 *
 * <p>
 * The encoding will all be of a fixed length, except for the 'do not encode'
 * chars, which will be of length 1
 * </p>
 *
 * <h2>Sample usage</h2>
 *
 * <pre>
 * Character[] originals;   // a, b, c, d
 * Character[] encoding;    // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
 * encoding, doNotEncode);
 *
 * ac.encode("a");    // 00
 * ac.encode("b");    // 01
 * ac.encode("c");    // 0d
 * ac.encode("d");    // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
 * change internal state.
 * </p>
 *
 * @since 1.0
 */
public final class AlphabetConverter {

    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";

    /**
     * Creates new String that contains just the given code point.
     *
     * @param i code point
     * @return a new string with the new code point
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        return new String(Character.toChars(i));
    }

    /**
     * Converts characters to integers.
     *
     * @param chars array of characters
     * @return an equivalent array of integers
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (ArrayUtils.isEmpty(chars)) {
            return ArrayUtils.EMPTY_INTEGER_OBJECT_ARRAY;
        }
        final Integer[] integers = new Integer[chars.length];
        Arrays.setAll(integers, i -> (int) chars[i]);
        return integers;
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of ints representing the original alphabet in
     *                 code points
     * @param encoding an array of ints representing the alphabet to be used for
     *                 encoding, in code points
     * @param doNotEncode an array of ints representing the chars to be encoded
     *                    using the original alphabet - every char
     *                    here must appear in both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                   constructed
     */
    public static AlphabetConverter createConverter(
            final Integer[] original,
            final Integer[] encoding,
            final Integer[] doNotEncode) {
        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        final int encodedLetterLength;

        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original "
                                + "alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            encodedLetterLength = 1;

            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString = codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter, originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
                } else {
                    Integer next = it.next();

                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter, originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);

        }
        if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding "
                            + "those in the 'do not encode' list), but has "
                            + (encodingCopy.size() - doNotEncodeCopy.size()));
        }
        // we start with one which is our minimum, and because we do the
        // first division outside the loop
        int lettersSoFar = 1;

        // the first division takes into account that the doNotEncode
        // letters can't be in the leftmost place
        int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                / (encodingCopy.size() - doNotEncodeCopy.size());

        while (lettersLeft / encodingCopy.size() >= 1) {
            lettersLeft /= encodingCopy.size();
            lettersSoFar++;
        }

        encodedLetterLength = lettersSoFar + 1;

        final AlphabetConverter ac =
                new AlphabetConverter(originalToEncoded,
                        encodedToOriginal,
                        encodedLetterLength);

        ac.addSingleEncoding(encodedLetterLength,
                StringUtils.EMPTY,
                encodingCopy,
                originalCopy.iterator(),
                doNotEncodeMap);

        return ac;
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving the characters in
     * <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used
     *                 for encoding
     * @param doNotEncode an array of chars to be encoded using the original
     *                    alphabet - every char here must appear in
     *                    both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverterFromChars(
            final Character[] original,
            final Character[] encoding,
            final Character[] doNotEncode) {
        return AlphabetConverter.createConverter(
                convertCharsToIntegers(original),
                convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Creates a new converter from a map.
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded()
     * @return The reconstructed AlphabetConverter
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
        final Map<Integer, String> unmodifiableOriginalToEncoded = Collections.unmodifiableMap(originalToEncoded);
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) {
            encodedToOriginal.put(e.getValue(), codePointToString(e.getKey()));

            if (e.getValue().length() > encodedLetterLength) {
                encodedLetterLength = e.getValue().length();
            }
        }

        return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength);
    }

    /**
     * Original string to be encoded.
     */
    private final Map<Integer, String> originalToEncoded;

    /**
     * Encoding alphabet.
     */
    private final Map<String, String> encodedToOriginal;

    /**
     * Length of the encoded letter.
     */
    private final int encodedLetterLength;

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     *
     * @param originalToEncoded original string to be encoded
     * @param encodedToOriginal encoding alphabet
     * @param encodedLetterLength length of the encoded letter
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                              final Map<String, String> encodedToOriginal,
                              final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Recursive method used when creating encoder/decoder.
     *
     * @param level at which point it should add a single encoding
     * @param currentEncoding current encoding
     * @param encoding letters encoding
     * @param originals original values
     * @param doNotEncodeMap map of values that should not be encoded
     */
    private void addSingleEncoding(final int level,
                                   final String currentEncoding,
                                   final Collection<Integer> encoding,
                                   final Iterator<Integer> originals,
                                   final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (!originals.hasNext()) {
                    return; // done encoding all the original alphabet
                }
                // this skips the doNotEncode chars if they are in the
                // leftmost place
                if (level != encodedLetterLength
                        || !doNotEncodeMap.containsKey(encodingLetter)) {
                    addSingleEncoding(level - 1,
                            currentEncoding
                                    + codePointToString(encodingLetter),
                            encoding,
                            originals,
                            doNotEncodeMap
                    );
                }
            }
        } else {
            Integer next = originals.next();

            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString,
                        originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Decodes a given string.
     *
     * @param encoded a string that has been encoded using this
     *                AlphabetConverter
     * @return The decoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if unexpected characters that
     *                                      cannot be handled are encountered
     */
    public String decode(final String encoded)
            throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int i = encoded.codePointAt(j);
            final String s = codePointToString(i);

            if (s.equals(originalToEncoded.get(i))) {
                result.append(s);
                j++; // because we do not encode in Unicode extended the
                     // length of each encoded char is 1
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end "
                            + "of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j,
                        j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding ("
                                    + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Encodes a given string.
     *
     * @param original the string to be encoded
     * @return The encoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if chars that are not supported are
     *                                      encountered
     */
    public String encode(final String original)
            throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        for (int i = 0; i < original.length();) {
            final int codePoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codePoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '"
                                + codePointToString(codePoint)
                                + "' in "
                                + original
                );
            }

            sb.append(nextLetter);

            i += Character.charCount(codePoint);
        }

        return sb.toString();
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj == null) {
            return false;
        }
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof AlphabetConverter)) {
            return false;
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded)
                && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    /**
     * Gets the length of characters in the encoded alphabet that are necessary
     * for each character in the original
     * alphabet.
     *
     * @return The length of the encoded char
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Gets the mapping from integer code point of source language to encoded
     * string. Use to reconstruct converter from
     * serialized map.
     *
     * @return The original map
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        // @formatter:off
        originalToEncoded.forEach((k, v) ->
            sb.append(codePointToString(k))
              .append(ARROW)
              .append(k)
              .append(System.lineSeparator()));
        // @formatter:on
        return sb.toString();
    }
}