001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019import java.util.Arrays;
020
021/**
022 * An algorithm for measuring the difference between two character sequences.
023 *
024 * <p>
025 * This is the number of changes needed to change one sequence into another,
026 * where each change is a single character modification (deletion, insertion
027 * or substitution).
028 * </p>
029 *
030 * @since 1.0
031 */
032public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> {
033
034    /**
035     * Default instance.
036     */
037    private static final LevenshteinDetailedDistance DEFAULT_INSTANCE = new LevenshteinDetailedDistance();
038    /**
039     * Threshold.
040     */
041    private final Integer threshold;
042
043    /**
044     * <p>
045     * This returns the default instance that uses a version
046     * of the algorithm that does not use a threshold parameter.
047     * </p>
048     *
049     * @see LevenshteinDetailedDistance#getDefaultInstance()
050     */
051    public LevenshteinDetailedDistance() {
052        this(null);
053    }
054
055    /**
056     * If the threshold is not null, distance calculations will be limited to a maximum length.
057     *
058     * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p>
059     *
060     * @param threshold If this is null then distances calculations will not be limited. This may not be negative.
061     */
062    public LevenshteinDetailedDistance(final Integer threshold) {
063        if (threshold != null && threshold < 0) {
064            throw new IllegalArgumentException("Threshold must not be negative");
065        }
066        this.threshold = threshold;
067    }
068
069    /**
070     * <p>Find the Levenshtein distance between two Strings.</p>
071     *
072     * <p>A higher score indicates a greater distance.</p>
073     *
074     * <p>The previous implementation of the Levenshtein distance algorithm
075     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
076     *
077     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
078     * which can occur when my Java implementation is used with very large strings.<br>
079     * This implementation of the Levenshtein distance algorithm
080     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
081     *
082     * <pre>
083     * distance.apply(null, *)             = IllegalArgumentException
084     * distance.apply(*, null)             = IllegalArgumentException
085     * distance.apply("","")               = 0
086     * distance.apply("","a")              = 1
087     * distance.apply("aaapppp", "")       = 7
088     * distance.apply("frog", "fog")       = 1
089     * distance.apply("fly", "ant")        = 3
090     * distance.apply("elephant", "hippo") = 7
091     * distance.apply("hippo", "elephant") = 7
092     * distance.apply("hippo", "zzzzzzzz") = 8
093     * distance.apply("hello", "hallo")    = 1
094     * </pre>
095     *
096     * @param left the first string, must not be null
097     * @param right the second string, must not be null
098     * @return result distance, or -1
099     * @throws IllegalArgumentException if either String input {@code null}
100     */
101    @Override
102    public LevenshteinResults apply(final CharSequence left, final CharSequence right) {
103        if (threshold != null) {
104            return limitedCompare(left, right, threshold);
105        }
106        return unlimitedCompare(left, right);
107    }
108
109    /**
110     * Gets the default instance.
111     *
112     * @return the default instace
113     */
114    public static LevenshteinDetailedDistance getDefaultInstance() {
115        return DEFAULT_INSTANCE;
116    }
117
118    /**
119     * Gets the distance threshold.
120     *
121     * @return the distance threshold
122     */
123    public Integer getThreshold() {
124        return threshold;
125    }
126
127    /**
128     * Find the Levenshtein distance between two CharSequences if it's less than or
129     * equal to a given threshold.
130     *
131     * <p>
132     * This implementation follows from Algorithms on Strings, Trees and
133     * Sequences by Dan Gusfield and Chas Emerick's implementation of the
134     * Levenshtein distance algorithm from <a
135     * href="http://www.merriampark.com/ld.htm"
136     * >http://www.merriampark.com/ld.htm</a>
137     * </p>
138     *
139     * <pre>
140     * limitedCompare(null, *, *)             = IllegalArgumentException
141     * limitedCompare(*, null, *)             = IllegalArgumentException
142     * limitedCompare(*, *, -1)               = IllegalArgumentException
143     * limitedCompare("","", 0)               = 0
144     * limitedCompare("aaapppp", "", 8)       = 7
145     * limitedCompare("aaapppp", "", 7)       = 7
146     * limitedCompare("aaapppp", "", 6))      = -1
147     * limitedCompare("elephant", "hippo", 7) = 7
148     * limitedCompare("elephant", "hippo", 6) = -1
149     * limitedCompare("hippo", "elephant", 7) = 7
150     * limitedCompare("hippo", "elephant", 6) = -1
151     * </pre>
152     *
153     * @param left the first CharSequence, must not be null
154     * @param right the second CharSequence, must not be null
155     * @param threshold the target threshold, must not be negative
156     * @return result distance, or -1
157     */
158    private static LevenshteinResults limitedCompare(CharSequence left,
159                                                     CharSequence right,
160                                                     final int threshold) { //NOPMD
161        if (left == null || right == null) {
162            throw new IllegalArgumentException("CharSequences must not be null");
163        }
164        if (threshold < 0) {
165            throw new IllegalArgumentException("Threshold must not be negative");
166        }
167
168        /*
169         * This implementation only computes the distance if it's less than or
170         * equal to the threshold value, returning -1 if it's greater. The
171         * advantage is performance: unbounded distance is O(nm), but a bound of
172         * k allows us to reduce it to O(km) time by only computing a diagonal
173         * stripe of width 2k + 1 of the cost table. It is also possible to use
174         * this to compute the unbounded Levenshtein distance by starting the
175         * threshold at 1 and doubling each time until the distance is found;
176         * this is O(dm), where d is the distance.
177         *
178         * One subtlety comes from needing to ignore entries on the border of
179         * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry
180         * to the left of the leftmost member We must ignore the entry above the
181         * rightmost member
182         *
183         * Another subtlety comes from our stripe running off the matrix if the
184         * strings aren't of the same size. Since string s is always swapped to
185         * be the shorter of the two, the stripe will always run off to the
186         * upper right instead of the lower left of the matrix.
187         *
188         * As a concrete example, suppose s is of length 5, t is of length 7,
189         * and our threshold is 1. In this case we're going to walk a stripe of
190         * length 3. The matrix would look like so:
191         *
192         * <pre>
193         *    1 2 3 4 5
194         * 1 |#|#| | | |
195         * 2 |#|#|#| | |
196         * 3 | |#|#|#| |
197         * 4 | | |#|#|#|
198         * 5 | | | |#|#|
199         * 6 | | | | |#|
200         * 7 | | | | | |
201         * </pre>
202         *
203         * Note how the stripe leads off the table as there is no possible way
204         * to turn a string of length 5 into one of length 7 in edit distance of
205         * 1.
206         *
207         * Additionally, this implementation decreases memory usage by using two
208         * single-dimensional arrays and swapping them back and forth instead of
209         * allocating an entire n by m matrix. This requires a few minor
210         * changes, such as immediately returning when it's detected that the
211         * stripe has run off the matrix and initially filling the arrays with
212         * large values so that entries we don't compute are ignored.
213         *
214         * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for
215         * some discussion.
216         */
217
218        int n = left.length(); // length of left
219        int m = right.length(); // length of right
220
221        // if one string is empty, the edit distance is necessarily the length of the other
222        if (n == 0) {
223            return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0);
224        } else if (m == 0) {
225            return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0);
226        }
227
228        boolean swapped = false;
229        if (n > m) {
230            // swap the two strings to consume less memory
231            final CharSequence tmp = left;
232            left = right;
233            right = tmp;
234            n = m;
235            m = right.length();
236            swapped = true;
237        }
238
239        int[] p = new int[n + 1]; // 'previous' cost array, horizontally
240        int[] d = new int[n + 1]; // cost array, horizontally
241        int[] tempD; // placeholder to assist in swapping p and d
242        final int[][] matrix = new int[m + 1][n + 1];
243
244        //filling the first row and first column values in the matrix
245        for (int index = 0; index <= n; index++) {
246            matrix[0][index] = index;
247        }
248        for (int index = 0; index <= m; index++) {
249            matrix[index][0] = index;
250        }
251
252        // fill in starting table values
253        final int boundary = Math.min(n, threshold) + 1;
254        for (int i = 0; i < boundary; i++) {
255            p[i] = i;
256        }
257        // these fills ensure that the value above the rightmost entry of our
258        // stripe will be ignored in following loop iterations
259        Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
260        Arrays.fill(d, Integer.MAX_VALUE);
261
262        // iterates through t
263        for (int j = 1; j <= m; j++) {
264            final char rightJ = right.charAt(j - 1); // jth character of right
265            d[0] = j;
266
267            // compute stripe indices, constrain to array size
268            final int min = Math.max(1, j - threshold);
269            final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min(
270                    n, j + threshold);
271
272            // the stripe may lead off of the table if s and t are of different sizes
273            if (min > max) {
274                return new LevenshteinResults(-1, 0, 0, 0);
275            }
276
277            // ignore entry left of leftmost
278            if (min > 1) {
279                d[min - 1] = Integer.MAX_VALUE;
280            }
281
282            // iterates through [min, max] in s
283            for (int i = min; i <= max; i++) {
284                if (left.charAt(i - 1) == rightJ) {
285                    // diagonally left and up
286                    d[i] = p[i - 1];
287                } else {
288                    // 1 + minimum of cell to the left, to the top, diagonally left and up
289                    d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]);
290                }
291                matrix[j][i] = d[i];
292            }
293
294            // copy current distance counts to 'previous row' distance counts
295            tempD = p;
296            p = d;
297            d = tempD;
298        }
299
300        // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance
301        if (p[n] <= threshold) {
302            return findDetailedResults(left, right, matrix, swapped);
303        }
304        return new LevenshteinResults(-1, 0, 0, 0);
305    }
306
307    /**
308     * <p>Find the Levenshtein distance between two Strings.</p>
309     *
310     * <p>A higher score indicates a greater distance.</p>
311     *
312     * <p>The previous implementation of the Levenshtein distance algorithm
313     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
314     *
315     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
316     * which can occur when my Java implementation is used with very large strings.<br>
317     * This implementation of the Levenshtein distance algorithm
318     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
319     *
320     * <pre>
321     * unlimitedCompare(null, *)             = IllegalArgumentException
322     * unlimitedCompare(*, null)             = IllegalArgumentException
323     * unlimitedCompare("","")               = 0
324     * unlimitedCompare("","a")              = 1
325     * unlimitedCompare("aaapppp", "")       = 7
326     * unlimitedCompare("frog", "fog")       = 1
327     * unlimitedCompare("fly", "ant")        = 3
328     * unlimitedCompare("elephant", "hippo") = 7
329     * unlimitedCompare("hippo", "elephant") = 7
330     * unlimitedCompare("hippo", "zzzzzzzz") = 8
331     * unlimitedCompare("hello", "hallo")    = 1
332     * </pre>
333     *
334     * @param left the first CharSequence, must not be null
335     * @param right the second CharSequence, must not be null
336     * @return result distance, or -1
337     * @throws IllegalArgumentException if either CharSequence input is {@code null}
338     */
339    private static LevenshteinResults unlimitedCompare(CharSequence left, CharSequence right) {
340        if (left == null || right == null) {
341            throw new IllegalArgumentException("CharSequences must not be null");
342        }
343
344        /*
345           The difference between this impl. and the previous is that, rather
346           than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
347           we maintain two single-dimensional arrays of length s.length() + 1.  The first, d,
348           is the 'current working' distance array that maintains the newest distance cost
349           counts as we iterate through the characters of String s.  Each time we increment
350           the index of String t we are comparing, d is copied to p, the second int[].  Doing so
351           allows us to retain the previous cost counts as required by the algorithm (taking
352           the minimum of the cost count to the left, up one, and diagonally up and to the left
353           of the current cost count being calculated).  (Note that the arrays aren't really
354           copied anymore, just switched...this is clearly much better than cloning an array
355           or doing a System.arraycopy() each time  through the outer loop.)
356
357           Effectively, the difference between the two implementations is this one does not
358           cause an out of memory condition when calculating the LD over two very large strings.
359         */
360
361        int n = left.length(); // length of left
362        int m = right.length(); // length of right
363
364        if (n == 0) {
365            return new LevenshteinResults(m, m, 0, 0);
366        } else if (m == 0) {
367            return new LevenshteinResults(n, 0, n, 0);
368        }
369        boolean swapped = false;
370        if (n > m) {
371            // swap the input strings to consume less memory
372            final CharSequence tmp = left;
373            left = right;
374            right = tmp;
375            n = m;
376            m = right.length();
377            swapped = true;
378        }
379
380        int[] p = new int[n + 1]; // 'previous' cost array, horizontally
381        int[] d = new int[n + 1]; // cost array, horizontally
382        int[] tempD; //placeholder to assist in swapping p and d
383        final int[][] matrix = new int[m + 1][n + 1];
384
385        // filling the first row and first column values in the matrix
386        for (int index = 0; index <= n; index++) {
387            matrix[0][index] = index;
388        }
389        for (int index = 0; index <= m; index++) {
390            matrix[index][0] = index;
391        }
392
393        // indexes into strings left and right
394        int i; // iterates through left
395        int j; // iterates through right
396
397        char rightJ; // jth character of right
398
399        int cost; // cost
400        for (i = 0; i <= n; i++) {
401            p[i] = i;
402        }
403
404        for (j = 1; j <= m; j++) {
405            rightJ = right.charAt(j - 1);
406            d[0] = j;
407
408            for (i = 1; i <= n; i++) {
409                cost = left.charAt(i - 1) == rightJ ? 0 : 1;
410                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
411                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
412                //filling the matrix
413                matrix[j][i] = d[i];
414            }
415
416            // copy current distance counts to 'previous row' distance counts
417            tempD = p;
418            p = d;
419            d = tempD;
420        }
421        return findDetailedResults(left, right, matrix, swapped);
422    }
423
424    /**
425     * Finds count for each of the three [insert, delete, substitute] operations
426     * needed. This is based on the matrix formed based on the two character
427     * sequence.
428     *
429     * @param left character sequence which need to be converted from
430     * @param right character sequence which need to be converted to
431     * @param matrix two dimensional array containing
432     * @param swapped tells whether the value for left character sequence and right
433     *            character sequence were swapped to save memory
434     * @return result object containing the count of insert, delete and substitute and total count needed
435     */
436    private static LevenshteinResults findDetailedResults(final CharSequence left,
437                                                          final CharSequence right,
438                                                          final int[][] matrix,
439                                                          final boolean swapped) {
440
441        int delCount = 0;
442        int addCount = 0;
443        int subCount = 0;
444
445        int rowIndex = right.length();
446        int columnIndex = left.length();
447
448        int dataAtLeft = 0;
449        int dataAtTop = 0;
450        int dataAtDiagonal = 0;
451        int data = 0;
452        boolean deleted = false;
453        boolean added = false;
454
455        while (rowIndex >= 0 && columnIndex >= 0) {
456
457            if (columnIndex == 0) {
458                dataAtLeft = -1;
459            } else {
460                dataAtLeft = matrix[rowIndex][columnIndex - 1];
461            }
462            if (rowIndex == 0) {
463                dataAtTop = -1;
464            } else {
465                dataAtTop = matrix[rowIndex - 1][columnIndex];
466            }
467            if (rowIndex > 0 && columnIndex > 0) {
468                dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1];
469            } else {
470                dataAtDiagonal = -1;
471            }
472            if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) {
473                break;
474            }
475            data = matrix[rowIndex][columnIndex];
476
477            // case in which the character at left and right are the same,
478            // in this case none of the counters will be incremented.
479            if (columnIndex > 0 && rowIndex > 0 && left.charAt(columnIndex - 1) == right.charAt(rowIndex - 1)) {
480                columnIndex--;
481                rowIndex--;
482                continue;
483            }
484
485            // handling insert and delete cases.
486            deleted = false;
487            added = false;
488            if (data - 1 == dataAtLeft && (data <= dataAtDiagonal && data <= dataAtTop)
489                    || (dataAtDiagonal == -1 && dataAtTop == -1)) { // NOPMD
490                columnIndex--;
491                if (swapped) {
492                    addCount++;
493                    added = true;
494                } else {
495                    delCount++;
496                    deleted = true;
497                }
498            } else if (data - 1 == dataAtTop && (data <= dataAtDiagonal && data <= dataAtLeft)
499                    || (dataAtDiagonal == -1 && dataAtLeft == -1)) { // NOPMD
500                rowIndex--;
501                if (swapped) {
502                    delCount++;
503                    deleted = true;
504                } else {
505                    addCount++;
506                    added = true;
507                }
508            }
509
510            // substituted case
511            if (!added && !deleted) {
512                subCount++;
513                columnIndex--;
514                rowIndex--;
515            }
516        }
517        return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount);
518    }
519}