001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.io.UnsupportedEncodingException; 020import java.util.Arrays; 021import java.util.Collection; 022import java.util.Collections; 023import java.util.HashMap; 024import java.util.Iterator; 025import java.util.LinkedHashMap; 026import java.util.LinkedHashSet; 027import java.util.Map; 028import java.util.Map.Entry; 029import java.util.Objects; 030import java.util.Set; 031 032/** 033 * <p> 034 * Convert from one alphabet to another, with the possibility of leaving certain 035 * characters unencoded. 036 * </p> 037 * 038 * <p> 039 * The target and do not encode languages must be in the Unicode BMP, but the 040 * source language does not. 
/**
 * <p>
 * Converts text from one alphabet to another, with the possibility of leaving
 * certain characters unencoded.
 * </p>
 *
 * <p>
 * The encoding alphabet and the "do not encode" characters are expected to be
 * in the Unicode BMP; the original (source) alphabet may contain any code
 * point.
 * </p>
 *
 * <p>
 * Every encoded letter has the same fixed length, except for the
 * "do not encode" characters, which are copied through unchanged.
 * </p>
 *
 * <h3>Sample usage</h3>
 *
 * <pre>
 * Character[] originals;   // a, b, c, d
 * Character[] encoding;    // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
 * encoding, doNotEncode);
 *
 * ac.encode("a");    // 00
 * ac.encode("b");    // 01
 * ac.encode("c");    // 0d
 * ac.encode("d");    // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# The public methods of a constructed AlphabetConverter do not
 * change its internal state.
 * </p>
 *
 * @since 1.0
 */
public final class AlphabetConverter {

    /**
     * Maps each code point of the original alphabet to its encoded string.
     */
    private final Map<Integer, String> originalToEncoded;

    /**
     * Maps each encoded string back to the original letter (as a string).
     */
    private final Map<String, String> encodedToOriginal;

    /**
     * Number of chars consumed per encoded group while decoding.
     */
    private final int encodedLetterLength;

    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     *
     * @param originalToEncoded map from original code point to encoded string
     * @param encodedToOriginal map from encoded string to original letter
     * @param encodedLetterLength length of an encoded letter
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                              final Map<String, String> encodedToOriginal,
                              final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Encode a given string.
     *
     * @param original the string to be encoded
     * @return the encoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if a code point without a mapping
     *                                      is encountered
     */
    public String encode(final String original)
            throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        // Walk by code point so that supplementary characters in the source
        // alphabet are looked up as a whole.
        for (int i = 0; i < original.length();) {
            final int codepoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codepoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '"
                                + codePointToString(codepoint)
                                + "' in "
                                + original
                );
            }

            sb.append(nextLetter);

            i += Character.charCount(codepoint);
        }

        return sb.toString();
    }

    /**
     * Decode a given string.
     *
     * @param encoded a string that has been encoded using this
     *                AlphabetConverter
     * @return the decoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if unexpected characters that
     *                                      cannot be handled are encountered
     */
    public String decode(final String encoded)
            throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int codePoint = encoded.codePointAt(j);
            final String s = codePointToString(codePoint);

            if (s.equals(originalToEncoded.get(codePoint))) {
                // Pass-through letter: it is encoded as itself.
                result.append(s);
                // Advance by the number of chars the code point occupies so
                // that a supplementary pass-through letter is skipped as a
                // whole (this is 1 for every BMP character).
                j += Character.charCount(codePoint);
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end "
                            + "of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j,
                        j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding ("
                                    + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Get the length of characters in the encoded alphabet that are necessary
     * for each character in the original alphabet.
     *
     * @return the length of the encoded char
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Get the mapping from integer code point of source language to encoded
     * string. Use to reconstruct converter from serialized map.
     *
     * @return an unmodifiable view of the original-to-encoded map
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    /**
     * Recursive method used when creating encoder/decoder. Builds every
     * encoded group one letter at a time and, once a group is complete
     * ({@code level == 0}), assigns it to the next original letter.
     *
     * @param level number of letters still to be appended to
     *              {@code currentEncoding}
     * @param currentEncoding the encoded group built so far
     * @param encoding letters of the encoding alphabet
     * @param originals iterator over the original letters still unassigned
     * @param doNotEncodeMap map of values that should not be encoded
     */
    @SuppressWarnings("PMD")
    private void addSingleEncoding(final int level,
                                   final String currentEncoding,
                                   final Collection<Integer> encoding,
                                   final Iterator<Integer> originals,
                                   final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (originals.hasNext()) {

                    // this skips the doNotEncode chars if they are in the
                    // leftmost place
                    if (level != encodedLetterLength
                            || !doNotEncodeMap.containsKey(encodingLetter)) {
                        addSingleEncoding(level - 1,
                                currentEncoding
                                        + codePointToString(encodingLetter),
                                encoding,
                                originals,
                                doNotEncodeMap
                        );
                    }
                } else {
                    return; // done encoding all the original alphabet
                }
            }
        } else {
            Integer next = originals.next();

            // Letters on the 'do not encode' list map to themselves and do
            // not use up the group that was just built.
            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString,
                        originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Lists every mapping as {@code original -> encoded}, one per line.
     *
     * @return the textual form of the original-to-encoded map
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();

        for (final Entry<Integer, String> entry
                : originalToEncoded.entrySet()) {
            sb.append(codePointToString(entry.getKey()))
                    .append(ARROW)
                    .append(entry.getValue()).append(System.lineSeparator());
        }

        return sb.toString();
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj == this) {
            return true;
        }
        // instanceof is false for null, so no separate null check is needed
        if (!(obj instanceof AlphabetConverter)) {
            return false;
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded)
                && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    // -- static methods

    /**
     * Create a new converter from a map. The map is copied, so changes made
     * to it after this call do not affect the returned converter.
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded()
     * @return the reconstructed AlphabetConverter
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(
            final Map<Integer, String> originalToEncoded) {
        // Defensive copy (LinkedHashMap keeps the caller's iteration order):
        // the reverse map and the letter length below are derived from this
        // snapshot and must stay consistent with it.
        final Map<Integer, String> snapshot =
                new LinkedHashMap<>(originalToEncoded);
        final Map<Integer, String> unmodifiableOriginalToEncoded =
                Collections.unmodifiableMap(snapshot);
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e
                : unmodifiableOriginalToEncoded.entrySet()) {
            final String originalAsString = codePointToString(e.getKey());
            encodedToOriginal.put(e.getValue(), originalAsString);

            // the longest encoded string determines the group length
            if (e.getValue().length() > encodedLetterLength) {
                encodedLetterLength = e.getValue().length();
            }
        }

        return new AlphabetConverter(unmodifiableOriginalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving the characters in
     * <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used
     *                 for encoding
     * @param doNotEncode an array of chars to be encoded using the original
     *                    alphabet - every char here must appear in
     *                    both the previous params
     * @return the AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverterFromChars(
            final Character[] original,
            final Character[] encoding,
            final Character[] doNotEncode) {
        return AlphabetConverter.createConverter(
                convertCharsToIntegers(original),
                convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Convert characters to integers.
     *
     * @param chars array of characters, may be {@code null}
     * @return an equivalent array of integers; empty if {@code chars} is
     *         {@code null} or empty
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (chars == null || chars.length == 0) {
            return new Integer[0];
        }
        final Integer[] integers = new Integer[chars.length];
        for (int i = 0; i < chars.length; i++) {
            integers[i] = (int) chars[i];
        }
        return integers;
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of ints representing the original alphabet in
     *                 codepoints
     * @param encoding an array of ints representing the alphabet to be used for
     *                 encoding, in codepoints
     * @param doNotEncode an array of ints representing the chars to be encoded
     *                    using the original alphabet - every char
     *                    here must appear in both the previous params
     * @return the AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverter(
            final Integer[] original,
            final Integer[] encoding,
            final Integer[] doNotEncode) {
        // LinkedHashSet drops duplicates while keeping the caller's order
        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        final int encodedLetterLength;

        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original "
                                + "alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            // Enough encoding letters for a one-to-one mapping.
            encodedLetterLength = 1;

            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString =
                        codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter,
                            originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString,
                            originalLetterAsString);
                } else {
                    Integer next = it.next();

                    // encoding letters reserved for pass-through are not
                    // available as the encoding of another letter
                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter,
                            originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded,
                    encodedToOriginal,
                    encodedLetterLength);

        } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding "
                            + "those in the 'do not encode' list), but has "
                            + (encodingCopy.size() - doNotEncodeCopy.size()));
        } else {
            // we start with one which is our minimum, and because we do the
            // first division outside the loop
            int lettersSoFar = 1;

            // the first division takes into account that the doNotEncode
            // letters can't be in the leftmost place
            int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                    / (encodingCopy.size() - doNotEncodeCopy.size());

            while (lettersLeft / encodingCopy.size() >= 1) {
                lettersLeft = lettersLeft / encodingCopy.size();
                lettersSoFar++;
            }

            encodedLetterLength = lettersSoFar + 1;

            final AlphabetConverter ac =
                    new AlphabetConverter(originalToEncoded,
                            encodedToOriginal,
                            encodedLetterLength);

            // fills both maps of the new converter in place
            ac.addSingleEncoding(encodedLetterLength,
                    "",
                    encodingCopy,
                    originalCopy.iterator(),
                    doNotEncodeMap);

            return ac;
        }
    }

    /**
     * Create new String that contains just the given code point.
     *
     * @param i code point
     * @return a new string with the new code point
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        return new String(Character.toChars(i));
    }
}