Welcome to DotNetManiac

Soundex Encode Surnames

Date Revised: August 24 2009

Visual Studio 2008 | C#

There are many Soundex encoders available on the internet, and most will provide similar results, but if you're looking for one whose algorithm is based on the Soundex guidelines of the National Archives, then this is the one.

The complete source code for the class is below. It's well commented and should be easy enough to follow. Note that this version returns a string array. That's because Rule 3 states that if a name has a prefix, two codes should be returned.

Live Demo

Enter surname :   

using System;
 
/// <summary>
/// http://www.archives.gov/publications/general-info-leaflets/55.html
/// 
/// DotNetManiac - July-02-07 8:54:15 AM
/// 
/// Every soundex code consists of a letter and three numbers, 
/// such as W-252. The letter is always the first letter of the surname. 
/// The numbers are assigned to the remaining letters of the surname according
/// to the soundex guide shown below. Zeroes are added at the end if necessary
/// to produce a four-character code. Additional letters are disregarded. 
/// 
/// 1 = B, F, P, V 
/// 2 = C, G, J, K, Q, S, X, Z 
/// 3 = D, T 
/// 4 = L 
/// 5 = M, N 
/// 6 = R 
///
/// Disregard the letters A, E, I, O, U, H, W, and Y. 
/// 
/// Additional Rules:
/// 1. If the surname has any double letters, they should be treated as one letter.
/// 2. If the surname has different letters side-by-side that have the same number
///    in the soundex coding guide, they should be treated as one letter.
/// 3. If a surname has a prefix, such as Van, Con, De, Di, La, or Le, code both
///    with and without the prefix because the surname might be listed under 
///    either code. Note, however, that Mc and Mac are not considered prefixes.
/// 4. If a vowel (A, E, I, O, U) separates two consonants that have the same 
///    soundex code, the consonant to the right of the vowel is coded.
///    NOTE: If "H" or "W" separate two consonants that have the same soundex code, 
///    the consonant to the right of the vowel is not coded. 
/// </summary>
 
public static class Soundex
{
    // Marker returned by encodeCharacter for letters that carry no soundex
    // digit (A, E, I, O, U, H, W, Y and anything outside A-Z).
    private const char NotCoded = '0';

    // Surname prefixes named in rule 3. No entry is a prefix of another,
    // so the order of this list does not affect which one is matched.
    private static readonly string[] Prefixes = { "VAN", "CON", "DE", "DI", "LA", "LE" };

    /// <summary>
    /// Encodes a surname into its soundex code(s).
    /// </summary>
    /// <param name="surname">
    /// The surname to encode. Case is ignored and every character that is
    /// not a letter (spaces, apostrophes, hyphens, ...) is dropped first.
    /// </param>
    /// <returns>
    /// Always a two element array. Element 0 is the code of the full surname,
    /// or an empty string when the input contains no letters. Element 1 is
    /// the code of the surname with its rule 3 prefix removed, or null when
    /// the surname has no such prefix or consists of nothing but the prefix.
    /// </returns>
    /// <exception cref="ArgumentNullException">surname is null.</exception>
    public static string[] encode(string surname)
    {
        if (surname == null)
        { throw new ArgumentNullException("surname"); }

        // two strings could be returned; see rule 3
        string[] s = new string[2];

        // Invariant upper-casing keeps the A-Z switch in encodeCharacter
        // working under every culture (e.g. Turkish dotted/dotless i).
        string letters = lettersOnly(surname.ToUpperInvariant());

        if (letters.Length == 0)
        {
            // nothing to encode; report "no code" instead of throwing
            s[0] = string.Empty;
            return s;
        }

        // the first letter always remains the same
        s[0] = encodeSurname(letters[0], letters.Substring(1));

        // if the surname has a prefix as per rule 3, remove it and encode a
        // second time. A surname that is only the prefix ("Le", "Van")
        // leaves nothing to encode, so the second code stays null.
        string withoutPrefix = removePrefix(letters);

        if (withoutPrefix.Length > 0)
        {
            s[1] = encodeSurname(withoutPrefix[0], withoutPrefix.Substring(1));
        }

        // return the result(s)
        return s;
    }

    /// <summary>
    /// Builds the four character code from the leading letter and the
    /// remaining upper-case letters of the surname.
    /// </summary>
    /// <param name="firstLetter">Leading letter; copied into the code as is.</param>
    /// <param name="rest">The surname without its leading letter.</param>
    /// <returns>One letter followed by three digits, zero padded.</returns>
    private static string encodeSurname(char firstLetter, string rest)
    {
        System.Text.StringBuilder code = new System.Text.StringBuilder(4);
        code.Append(firstLetter);

        // Seed with the first letter's own code so that a following letter
        // with the same code is treated as a repeat (Pfister -> P236).
        char prevCode = encodeCharacter(firstLetter);

        for (int i = 0; i < rest.Length && code.Length < 4; i++)
        {
            char letter = rest[i];

            // H and W are transparent: they neither produce a digit nor
            // break a run of identical codes (Ashcraft -> A261).
            if (letter == 'H' || letter == 'W')
            { continue; }

            char encodedCharacter = encodeCharacter(letter);

            // rule 1 and 2; do not add side by side identical codes
            if (encodedCharacter != NotCoded && encodedCharacter != prevCode)
            { code.Append(encodedCharacter); }

            // rule 4; a vowel (or Y) resets prevCode to NotCoded, so the
            // consonant to its right is coded even when it repeats the code
            // of the consonant to its left (Tymczak -> T522).
            prevCode = encodedCharacter;
        }

        // return 4 character soundex code, padded with 0's if necessary
        return code.ToString().PadRight(4, '0');
    }

    /// <summary>
    /// Maps an upper-case letter to its soundex digit.
    /// </summary>
    /// <param name="character">The letter to look up.</param>
    /// <returns>'1' to '6', or <c>NotCoded</c> for disregarded characters.</returns>
    private static char encodeCharacter(char character)
    {
        switch (character)
        {
            case 'B':
            case 'F':
            case 'P':
            case 'V':
                return '1';
            case 'C':
            case 'G':
            case 'J':
            case 'K':
            case 'Q':
            case 'S':
            case 'X':
            case 'Z':
                return '2';
            case 'D':
            case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M':
            case 'N':
                return '5';
            case 'R':
                return '6';
            // disregard all other characters
            default:
                return NotCoded;
        }
    }

    /// <summary>
    /// Returns the input with every non-letter character removed.
    /// </summary>
    private static string lettersOnly(string s)
    {
        System.Text.StringBuilder letters = new System.Text.StringBuilder(s.Length);

        foreach (char c in s)
        {
            if (char.IsLetter(c))
            { letters.Append(c); }
        }

        return letters.ToString();
    }

    /// <summary>
    /// Strips a leading rule 3 prefix (Van, Con, De, Di, La or Le).
    /// </summary>
    /// <param name="s">Upper-case surname.</param>
    /// <returns>
    /// The surname without its prefix, or an empty string when it does not
    /// start with a prefix. Only the leading occurrence is removed; the same
    /// letters appearing later in the name are left alone.
    /// </returns>
    private static string removePrefix(string s)
    {
        foreach (string prefix in Prefixes)
        {
            if (s.StartsWith(prefix, StringComparison.Ordinal))
            { return s.Substring(prefix.Length); }
        }

        return string.Empty;
    }
}
Welcome to DotNetManiac