001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.codec.language;
018
019import java.util.Locale;
020
021import org.apache.commons.codec.EncoderException;
022import org.apache.commons.codec.StringEncoder;
023
024/**
025 * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
026 * <p>
027 * This class is immutable and thread-safe.
028 * </p>
029 *
030 * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
031 * @since 1.8
032 */
033public class MatchRatingApproachEncoder implements StringEncoder {
034
035    private static final String SPACE = " ";
036
037    private static final String EMPTY = "";
038
039    /**
040     * The plain letter equivalent of the accented letters.
041     */
042    private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
043            "AaEeIiOoUuYy" + // acute
044            "AaEeIiOoUuYy" + // circumflex
045            "AaOoNn" + // tilde
046            "AaEeIiOoUuYy" + // umlaut
047            "Aa" + // ring
048            "Cc" + // cedilla
049            "OoUu"; // double acute
050
051    /**
052     * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
053     */
054    private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
055            "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
056            "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
057            "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
058            "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
059            "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
060
061    private static final String[] DOUBLE_CONSONANT =
062            { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
063                   "TT", "VV", "WW", "XX", "YY", "ZZ" };
064
065    /**
066     * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
067     * spaces.
068     *
069     * <h2>API Usage</h2>
070     * <p>
071     * Consider this method private, it is package protected for unit testing only.
072     * </p>
073     *
074     * @param name
075     *            The name to be cleaned
076     * @return The cleaned name
077     */
078    String cleanName(final String name) {
079        String upperName = name.toUpperCase(Locale.ENGLISH);
080
081        final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
082        for (final String str : charsToTrim) {
083            upperName = upperName.replaceAll(str, EMPTY);
084        }
085
086        upperName = removeAccents(upperName);
087        upperName = upperName.replaceAll("\\s+", EMPTY);
088
089        return upperName;
090    }
091
092    /**
093     * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
094     * Encoder interface Throws an EncoderException if input object is not of type java.lang.String.
095     *
096     * @param pObject
097     *            Object to encode
098     * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the
099     *         String supplied.
100     * @throws EncoderException
101     *             if the parameter supplied is not of type java.lang.String
102     */
103    @Override
104    public final Object encode(final Object pObject) throws EncoderException {
105        if (!(pObject instanceof String)) {
106            throw new EncoderException(
107                    "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
108        }
109        return encode((String) pObject);
110    }
111
112    /**
113     * Encodes a String using the Match Rating Approach (MRA) algorithm.
114     *
115     * @param name
116     *            String object to encode
117     * @return The MRA code corresponding to the String supplied
118     */
119    @Override
120    public final String encode(String name) {
121        // Bulletproof for trivial input - NINO
122        if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
123            return EMPTY;
124        }
125
126        // Preprocessing
127        name = cleanName(name);
128
129        // BEGIN: Actual encoding part of the algorithm...
130        // 1. Delete all vowels unless the vowel begins the word
131        name = removeVowels(name);
132
133        // 2. Remove second consonant from any double consonant
134        name = removeDoubleConsonants(name);
135
136        // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
137        name = getFirst3Last3(name);
138
139        return name;
140    }
141
142    /**
143     * Gets the first and last 3 letters of a name (if &gt; 6 characters) Else just returns the name.
144     *
145     * <h2>API Usage</h2>
146     * <p>
147     * Consider this method private, it is package protected for unit testing only.
148     * </p>
149     *
150     * @param name
151     *            The string to get the substrings from
152     * @return Annexed first and last 3 letters of input word.
153     */
154    String getFirst3Last3(final String name) {
155        final int nameLength = name.length();
156
157        if (nameLength > 6) {
158            final String firstThree = name.substring(0, 3);
159            final String lastThree = name.substring(nameLength - 3, nameLength);
160            return firstThree + lastThree;
161        }
162        return name;
163    }
164
165    /**
166     * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
167     * min rating. Values strictly from documentation.
168     *
169     * <h2>API Usage</h2>
170     * <p>
171     * Consider this method private, it is package protected for unit testing only.
172     * </p>
173     *
174     * @param sumLength
175     *            The length of 2 strings sent down
176     * @return The min rating value
177     */
178    int getMinRating(final int sumLength) {
179        int minRating = 0;
180
181        if (sumLength <= 4) {
182            minRating = 5;
183        } else if (sumLength <= 7) { // already know it is at least 5
184            minRating = 4;
185        } else if (sumLength <= 11) { // already know it is at least 8
186            minRating = 3;
187        } else if (sumLength == 12) {
188            minRating = 2;
189        } else {
190            minRating = 1; // docs said little here.
191        }
192
193        return minRating;
194    }
195
196    /**
197     * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
198     * strings are cleaned in the same way as {@link #encode(String)}.
199     *
200     * @param name1
201     *            First of the 2 strings (names) to compare
202     * @param name2
203     *            Second of the 2 names to compare
204     * @return {@code true} if the encodings are identical {@code false} otherwise.
205     */
206    public boolean isEncodeEquals(String name1, String name2) {
207        // Bulletproof for trivial input - NINO
208        if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
209            return false;
210        }
211        if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
212            return false;
213        }
214        if (name1.length() == 1 || name2.length() == 1) {
215            return false;
216        }
217        if (name1.equalsIgnoreCase(name2)) {
218            return true;
219        }
220
221        // Preprocessing
222        name1 = cleanName(name1);
223        name2 = cleanName(name2);
224
225        // Actual MRA Algorithm
226
227        // 1. Remove vowels
228        name1 = removeVowels(name1);
229        name2 = removeVowels(name2);
230
231        // 2. Remove double consonants
232        name1 = removeDoubleConsonants(name1);
233        name2 = removeDoubleConsonants(name2);
234
235        // 3. Reduce down to 3 letters
236        name1 = getFirst3Last3(name1);
237        name2 = getFirst3Last3(name2);
238
239        // 4. Check for length difference - if 3 or greater, then no similarity
240        // comparison is done
241        if (Math.abs(name1.length() - name2.length()) >= 3) {
242            return false;
243        }
244
245        // 5. Obtain the minimum rating value by calculating the length sum of the
246        // encoded Strings and sending it down.
247        final int sumLength = Math.abs(name1.length() + name2.length());
248        final int minRating = getMinRating(sumLength);
249
250        // 6. Process the encoded Strings from left to right and remove any
251        // identical characters found from both Strings respectively.
252        final int count = leftToRightThenRightToLeftProcessing(name1, name2);
253
254        // 7. Each PNI item that has a similarity rating equal to or greater than
255        // the min is considered to be a good candidate match
256        return count >= minRating;
257
258    }
259
260    /**
261     * Determines if a letter is a vowel.
262     *
263     * <h2>API Usage</h2>
264     * <p>
265     * Consider this method private, it is package protected for unit testing only.
266     * </p>
267     *
268     * @param letter
269     *            The letter under investigation
270     * @return True if a vowel, else false
271     */
272    boolean isVowel(final String letter) {
273        return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
274               letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
275    }
276
277    /**
278     * Processes the names from left to right (first) then right to left removing identical letters in same positions.
279     * Then subtracts the longer string that remains from 6 and returns this.
280     *
281     * <h2>API Usage</h2>
282     * <p>
283     * Consider this method private, it is package protected for unit testing only.
284     * </p>
285     *
286     * @param name1
287     *            name2
288     * @return the length as above
289     */
290    int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
291        final char[] name1Char = name1.toCharArray();
292        final char[] name2Char = name2.toCharArray();
293
294        final int name1Size = name1.length() - 1;
295        final int name2Size = name2.length() - 1;
296
297        String name1LtRStart = EMPTY;
298        String name1LtREnd = EMPTY;
299
300        String name2RtLStart = EMPTY;
301        String name2RtLEnd = EMPTY;
302
303        for (int i = 0; i < name1Char.length; i++) {
304            if (i > name2Size) {
305                break;
306            }
307
308            name1LtRStart = name1.substring(i, i + 1);
309            name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
310
311            name2RtLStart = name2.substring(i, i + 1);
312            name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
313
314            // Left to right...
315            if (name1LtRStart.equals(name2RtLStart)) {
316                name1Char[i] = ' ';
317                name2Char[i] = ' ';
318            }
319
320            // Right to left...
321            if (name1LtREnd.equals(name2RtLEnd)) {
322                name1Char[name1Size - i] = ' ';
323                name2Char[name2Size - i] = ' ';
324            }
325        }
326
327        // Char arrays -> string & remove extraneous space
328        final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
329        final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
330
331        // Final bit - subtract the longest string from 6 and return this int value
332        if (strA.length() > strB.length()) {
333            return Math.abs(6 - strA.length());
334        }
335        return Math.abs(6 - strB.length());
336    }
337
338    /**
339     * Removes accented letters and replaces with non-accented ASCII equivalent Case is preserved.
340     * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
341     *
342     * @param accentedWord
343     *            The word that may have accents in it.
344     * @return De-accented word
345     */
346    String removeAccents(final String accentedWord) {
347        if (accentedWord == null) {
348            return null;
349        }
350
351        final StringBuilder sb = new StringBuilder();
352        final int n = accentedWord.length();
353
354        for (int i = 0; i < n; i++) {
355            final char c = accentedWord.charAt(i);
356            final int pos = UNICODE.indexOf(c);
357            if (pos > -1) {
358                sb.append(PLAIN_ASCII.charAt(pos));
359            } else {
360                sb.append(c);
361            }
362        }
363
364        return sb.toString();
365    }
366
367    /**
368     * Replaces any double consonant pair with the single letter equivalent.
369     *
370     * <h2>API Usage</h2>
371     * <p>
372     * Consider this method private, it is package protected for unit testing only.
373     * </p>
374     *
375     * @param name
376     *            String to have double consonants removed
377     * @return Single consonant word
378     */
379    String removeDoubleConsonants(final String name) {
380        String replacedName = name.toUpperCase(Locale.ENGLISH);
381        for (final String dc : DOUBLE_CONSONANT) {
382            if (replacedName.contains(dc)) {
383                final String singleLetter = dc.substring(0, 1);
384                replacedName = replacedName.replace(dc, singleLetter);
385            }
386        }
387        return replacedName;
388    }
389
390    /**
391     * Deletes all vowels unless the vowel begins the word.
392     *
393     * <h2>API Usage</h2>
394     * <p>
395     * Consider this method private, it is package protected for unit testing only.
396     * </p>
397     *
398     * @param name
399     *            The name to have vowels removed
400     * @return De-voweled word
401     */
402    String removeVowels(String name) {
403        // Extract first letter
404        final String firstLetter = name.substring(0, 1);
405
406        name = name.replace("A", EMPTY);
407        name = name.replace("E", EMPTY);
408        name = name.replace("I", EMPTY);
409        name = name.replace("O", EMPTY);
410        name = name.replace("U", EMPTY);
411
412        name = name.replaceAll("\\s{2,}\\b", SPACE);
413
414        // return isVowel(firstLetter) ? (firstLetter + name) : name;
415        if (isVowel(firstLetter)) {
416            return firstLetter + name;
417        }
418        return name;
419    }
420}