Implementation of the Soundex algorithm (function) in different programming languages

Implementation of the Soundex algorithm (function) in different programming languages

Soundex is a phonetic algorithm for indexing names by sound, as pronounced in English. Soundex codes from different strings can be compared to see how similar the strings sound when spoken.

The first character of the code is the first character of the expression, converted to upper case. The second through fourth characters of the code are numbers that represent the letters in the expression. The letters A, E, I, O, U, H, W, and Y are ignored unless they are the first letter of the string. All international alphabetic characters outside the A-Z range are treated as vowels. Hence, two strings that sound almost the same should have identical soundex strings. For instance, the words "text" and "tixt" both produce a soundex of “T230”.

In this article, you'll find its implementation in the following programming languages :

Let's get started !

C

#include <stdio.h>

/*
 * Soundex class of each 7-bit ASCII character:
 *   -1   vowel: not coded, but separates two equal codes
 *    0   not coded and not a separator (any slot left unset)
 *   1-6  Soundex digit
 * The element type is explicitly signed so that -1 is stored as -1 even on
 * platforms where plain char is unsigned.
 */
static signed char code[128] = { 0 };

/*
 * Return the 4-character Soundex code of s: the first character (ASCII
 * upper-cased) followed by three digits, zero-padded.  Returns an empty
 * string when s is NULL or empty.
 *
 * The result lives in a static buffer, so it is overwritten by the next
 * call.  The code[] table must have been filled in before calling.
 */
const char* soundex(const char *s)
{
	static char out[5];
	int c, prev, i;
	unsigned char ch;

	out[0] = out[4] = 0;
	if (!s || !*s) return out;

	ch = (unsigned char)*s++;
	/* the table is ASCII-only, so an ASCII case fold is sufficient */
	if (ch >= 'a' && ch <= 'z') ch -= 'a' - 'A';
	out[0] = (char)ch;

	/* first letter, though not coded, can still affect next letter: Pfister */
	prev = ch < 128 ? code[ch] : -1;
	for (i = 1; *s && i < 4; s++) {
		ch = (unsigned char)*s;
		/* bytes outside 7-bit ASCII have no table slot: treat them as vowels */
		c = ch < 128 ? code[ch] : -1;
		if (c == prev) continue;

		if (c == -1) prev = 0;	/* vowel as separator */
		else if (c > 0) {
			out[i++] = c + '0';
			prev = c;
		}
	}
	while (i < 4) out[i++] = '0';
	return out;
}

/*
 * Store class c in the table slot of every character of s, and in the slot
 * of that character with bit 0x20 flipped (the other case of an ASCII letter).
 */
void add_code(const char *s, int c)
{
	const char *p;

	for (p = s; *p != '\0'; ++p) {
		const int ch = (int)*p;

		code[0x20 ^ ch] = c;
		code[ch] = code[0x20 ^ ch];
	}
}
 
/*
 * Fill the lookup table.  The position of each group in the list, minus one,
 * is the class passed to add_code(): "AEIOU" gets -1, the empty group is 0,
 * "BFPV" gets 1, and so on up to "R" with 6.
 */
void init()
{
	static const char *groups[] =
		{ "AEIOU", "", "BFPV", "CGJKQSXZ", "DT", "L", "MN", "R", 0};
	const char **g;

	for (g = groups; *g != 0; ++g)
		add_code(*g, (int)(g - groups) - 1);
}

Usage

/* Demo: build the lookup table, then print the Soundex code of one word. */
int main()
{
    init();
    /* J126 */
    /* Pass the code as an argument: it must never be used as the format string. */
    printf("%s\n", soundex("Javascript"));
 
    return 0;
}

C#

using System.Text.RegularExpressions;

/// <summary>American Soundex encoder.</summary>
public static class Soundex
{
    /// <summary>
    /// Returns the four-character Soundex code of <paramref name="word"/>:
    /// its first character followed by three digits, right-padded with '0'.
    /// </summary>
    /// <param name="word">Text to encode; may be null.</param>
    /// <returns>
    /// The code, or "0000" when nothing is left after removing every character
    /// that is neither a word character nor whitespace.
    /// </returns>
    public static string For(string word)
    {
        const int MaxSoundexCodeLength = 4;

        // Invariant upper-casing keeps the A-Z comparisons below independent
        // of the current culture.
        word = Regex.Replace(
            word == null ? string.Empty : word.ToUpperInvariant(),
            @"[^\w\s]",
            string.Empty);

        if (string.IsNullOrEmpty(word))
            return string.Empty.PadRight(MaxSoundexCodeLength, '0');

        // Fully qualified so the class needs no using directive beyond
        // System.Text.RegularExpressions.
        var soundexCode = new System.Text.StringBuilder(MaxSoundexCodeLength);
        soundexCode.Append(word[0]);

        // Code of the last character that was not H or W. It starts as the
        // first letter's own code so that e.g. "Pfister" does not code the F.
        var previousCode = GetCharNumberForLetter(word[0]);

        for (var i = 1;
             i < word.Length && soundexCode.Length < MaxSoundexCodeLength;
             i++)
        {
            var letter = word[i];

            // H and W are transparent: they neither get a digit nor separate
            // two letters that share a code.
            if (letter == 'H' || letter == 'W')
                continue;

            var currentCode = GetCharNumberForLetter(letter);

            if (currentCode != '0' && currentCode != previousCode)
                soundexCode.Append(currentCode);

            // Any uncoded character (vowel, digit, whitespace) yields '0' and
            // therefore lets the same digit be emitted again afterwards.
            previousCode = currentCode;
        }

        return soundexCode
                .ToString()
                    .PadRight(MaxSoundexCodeLength, '0');
    }

    /// <summary>
    /// Maps an upper-case letter to its Soundex digit character, or '0' for
    /// any character that has no digit.
    /// </summary>
    private static char GetCharNumberForLetter(char letter)
    {
        switch (letter)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K':
            case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return '0';
        }
    }
}

Usage

Soundex.For("CSharp Language") == Soundex.For("CSherp Language"); // true: both calls return "C614"

D

The D standard library (Phobos) already contains a soundex function.

import std.stdio: writeln;
import std.string: soundex;
 
void main() {
    // Each entry is [input word, Soundex code the library is expected to return].
    immutable string[2][] expectations = [
        ["soundex",   "S532"],
        ["example",   "E251"],
        ["ciondecks", "C532"],
        ["ekzampul",  "E251"],
        ["Robert",    "R163"],
        ["Rupert",    "R163"],
        ["Rubin",     "R150"],
        ["Ashcraft",  "A261"],
        ["Ashcroft",  "A261"],
        ["Tymczak",   "T522"],
    ];

    foreach (pair; expectations)
        assert(soundex(pair[0]) == pair[1]);
}

F#

/// Returns the four-character American Soundex code of `x`: the first letter
/// followed by three digits, right-padded with '0'.
/// Only the letters A-Z (after upper-casing) take part; every other character
/// is dropped. A null, empty or letter-free input yields "0000".
let americanSoundex (x : string) =
    // Soundex digit of a consonant; any other letter is returned unchanged so
    // that vowels keep acting as separators until the final digit filter.
    let codeOf ch =
        match ch with
        | 'B' | 'F' | 'P' | 'V' -> '1'
        | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> '2'
        | 'D' | 'T' -> '3'
        | 'L' -> '4'
        | 'M' | 'N' -> '5'
        | 'R' -> '6'
        | _ -> ch

    let isDigit ch = '0' <= ch && ch <= '9'

    // Collapses every run of equal adjacent items to a single item. The head
    // is kept for the next comparison so runs of three or more collapse too.
    let rec collapse xs =
        match xs with
        | h0 :: h1 :: t when h0 = h1 -> collapse (h0 :: t)
        | h :: t -> h :: collapse t
        | [] -> []

    // Truncates to four characters, then right-pads with '0' up to four.
    let pad4 (xs : char list) =
        let kept = xs |> Seq.truncate 4 |> Seq.toList
        kept @ List.replicate (4 - List.length kept) '0'

    let letters =
        if System.String.IsNullOrEmpty x then []
        else
            x.ToUpperInvariant().ToCharArray()
            |> Array.toList
            |> List.filter (fun ch -> 'A' <= ch && ch <= 'Z')

    match letters with
    | [] -> "0000"
    | first :: rest ->
        // H and W are removed only AFTER the first letter has been set aside,
        // so a name starting with H or W keeps its own initial.
        let coded =
            rest
            |> List.filter (fun ch -> ch <> 'H' && ch <> 'W')
            |> List.map codeOf
        // The first letter's code is prepended so that a following letter with
        // the same code is collapsed into it, then dropped again by List.tail.
        let digits =
            collapse (codeOf first :: coded)
            |> List.tail
            |> List.filter isDigit
        new System.String(pad4 (first :: digits) |> Array.ofList)

// Print one "name = code" line per sample name, with the name left-aligned
// in a column of eight characters.
for name in [ "Robert"; "Rupert"; "Robbert"; "Rubin"
              "Beer"; "Bear"; "Bearer"
              "Smith"; "Smyth"
              "Ashcraft"; "Ashcroft"
              "Tymczak"; "Pfister" ] do
    printfn "%-8s = %s" name (americanSoundex name)

(*

Robert   = R163
Rupert   = R163
Robbert  = R163
Rubin    = R150
Beer     = B600
Bear     = B600
Bearer   = B660
Smith    = S530
Smyth    = S530
Ashcraft = A261
Ashcroft = A261
Tymczak  = T522
Pfister  = P236

*)

Go

package myPackageName

import (
	"bytes"
	"strings"
	"fmt"
)

// codeLen is the length of a finished code: Soundex stops writing once the
// buffer holds this many bytes and pads shorter results up to it.
const codeLen = 4

// codes maps a single lower-case letter to its Soundex digit. Letters that
// carry no digit (a, e, h, i, o, u, w, y) map to the empty string, which is
// also what a lookup of any key missing from the map returns.
var codes = map[string]string{
	"a": "",
	"b": "1",
	"c": "2",
	"d": "3",
	"e": "",
	"f": "1",
	"g": "2",
	"h": "",
	"i": "",
	"j": "2",
	"k": "2",
	"l": "4",
	"m": "5",
	"n": "5",
	"o": "",
	"p": "1",
	"q": "2",
	"r": "6",
	"s": "2",
	"t": "3",
	"u": "",
	"v": "1",
	"w": "",
	"x": "2",
	"y": "",
	"z": "2",
}

// Soundex returns the American Soundex code of s: its first byte followed by
// up to three digits, right-padded with "0" to codeLen bytes and upper-cased.
// An empty s yields a string of codeLen zeros instead of panicking.
//
// The input is processed byte by byte, so only ASCII letters receive digits.
func Soundex(s string) string {
	if len(s) == 0 {
		return strings.Repeat("0", codeLen)
	}

	var encoded bytes.Buffer
	encoded.WriteByte(s[0])

	// last is the code of the most recent byte that was not h or w. It starts
	// as the first letter's own code so that "Pfister" does not code the f.
	last := codes[strings.ToLower(string(s[0]))]

	for i := 1; i < len(s) && encoded.Len() < codeLen; i++ {
		current := strings.ToLower(string(s[i]))

		// h and w are transparent: they get no digit and do not separate two
		// letters that share a code.
		if current == "h" || current == "w" {
			continue
		}

		// A missing key yields "", so vowels and non-letters both act as
		// separators that allow the same digit to be written again.
		c := codes[current]
		if c != "" && c != last {
			encoded.WriteByte(c[0])
		}
		last = c
	}

	if encoded.Len() < codeLen {
		padding := strings.Repeat("0", codeLen-encoded.Len())
		encoded.WriteString(padding)
	}

	return strings.ToUpper(encoded.String())
}

Usage

func main() {
	// Prints the code of one sample word; expected output: J126
	code := Soundex("Javascript")
	fmt.Println(code)
}

Java

/**
 * Returns the Soundex digit of {@code c} as a one-character string, or the
 * empty string when {@code c} is not one of the coded upper-case consonants.
 */
private static String getCode(char c){
  if("BFPV".indexOf(c) >= 0){
    return "1";
  }
  if("CGJKQSXZ".indexOf(c) >= 0){
    return "2";
  }
  if(c == 'D' || c == 'T'){
    return "3";
  }
  if(c == 'L'){
    return "4";
  }
  if(c == 'M' || c == 'N'){
    return "5";
  }
  if(c == 'R'){
    return "6";
  }
  return "";
}
 
/**
 * Returns the four-character American Soundex code of {@code s}: its first
 * character (upper-cased) followed by three digits, right-padded with '0'.
 *
 * @param s the text to encode; may be null or empty
 * @return the code, or "0000" when {@code s} is null or empty
 */
public static String soundex(String s){
  if(s == null || s.isEmpty()){
    return "0000";
  }
  // Upper-case once, locale-independently, instead of once per character.
  String upper = s.toUpperCase(java.util.Locale.ROOT);
  StringBuilder code = new StringBuilder(4);
  code.append(upper.charAt(0));
  // Start from the first letter's own code so that "Pfister" does not code the F.
  String previous = getCode(upper.charAt(0));
  for(int i = 1;i < upper.length() && code.length() < 4;i++){
    char c = upper.charAt(i);
    // H and W neither get a digit nor separate two letters sharing a code.
    if(c == 'H' || c == 'W'){
      continue;
    }
    String current = getCode(c);
    if(current.length() > 0 && !current.equals(previous)){
      code.append(current);
    }
    // An uncoded character resets previous to "", acting as a separator.
    previous = current;
  }
  while(code.length() < 4){
    code.append('0');
  }
  return code.toString();
}

Usage

public static void main(String[] args){
    // Expected output, one code per line: S532, E251, S532, E251
    String[] words = {"Soundex", "Example", "Sownteks", "Ekzampul"};
    for(String word : words){
        System.out.println(soundex(word));
    }
}

Javascript

/**
 * Returns the four-character American Soundex code of `s`: its first
 * character followed by three digits, right-padded with '0' and upper-cased.
 *
 * @param {string} s Text to encode; null, undefined and '' are accepted.
 * @returns {string} The code, or '0000' when there is nothing to encode.
 */
var soundex = function(s) {
    var codes = { a: '', e: '', i: '', o: '', u: '', y: '', b: 1, f: 1, p: 1, v: 1, c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2, d: 3, t: 3, l: 4, m: 5, n: 5, r: 6 },
        letters = String(s === undefined || s === null ? '' : s).toLowerCase().split(''),
        first = letters.shift(),
        digits = '',
        previous,
        code,
        ch,
        i;

    if (first === undefined) {
        return '0000';
    }

    // Start from the first letter's own code so that "Pfister" does not
    // code the f.
    previous = codes[first] || '';

    for (i = 0; i < letters.length && digits.length < 3; i++) {
        ch = letters[i];

        // h and w get no digit and do not separate letters sharing a code.
        if (ch === 'h' || ch === 'w') {
            continue;
        }

        // Vowels and characters missing from the table both yield '', which
        // lets the same digit be emitted again afterwards.
        code = codes[ch] || '';
        if (code !== '' && code !== previous) {
            digits += code;
        }
        previous = code;
    }

    return (first + digits + '000').slice(0, 4).toUpperCase();
};

Usage

soundex("Javascript") == soundex("Jabascript"); // true: both calls return "J126"

Objective-C

You can find an Objective-C implementation of the Soundex algorithm in this GitHub gist, written by Darkseed.

PHP

PHP already provides soundex as a built-in function that calculates the soundex key of a string.

Usage

soundex("PHP Server Language") == soundex("PHP Serber language"); // True: both keys are equal. NOTE(review): the key was originally quoted as P100, but PHP's soundex() should return P261 for both strings - verify.

Python

Function

def get_soundex(name):
	"""Return the four-character American Soundex code of ``name``.

	The code is the first character of ``name`` (upper-cased) followed by
	three digits, right-padded with "0". An empty ``name`` yields "0000".

	H and W are skipped entirely; vowels and any other uncoded character act
	as separators that allow the same digit to appear twice in a row.
	"""
	groups = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6"}
	# Flatten the groups into a per-letter table for direct lookup.
	codes = {letter: digit for letters, digit in groups.items() for letter in letters}

	name = name.upper()
	if not name:
		return "0000"

	soundex = name[0]
	# Start from the first letter's own code so that "Pfister" does not
	# code the F.
	previous = codes.get(name[0], "")

	for char in name[1:]:
		if char in ("H", "W"):
			continue
		code = codes.get(char, "")
		if code and code != previous:
			soundex += code
			if len(soundex) == 4:
				break
		previous = code

	return soundex.ljust(4, "0")

Usage

    # Print a two-column table: each sample name next to its Soundex code.
    # (All lines share one indentation level, and the list is not called
    # `list`, so the builtin of that name stays usable.)
    names = ["Smith", "Smythe", "Robert", "Rupert", "Schultz", "Shultz"]

    print("NAME\t\tSOUNDEX")
    for name in names:
        print("%s\t\t%s" % (name, get_soundex(name)))

Library

If you prefer to use a library, you can use the fuzzy package (which uses C Extensions (via Pyrex) for speed).

Ruby

class String
 
  SoundexChars = 'BFPVCGJKQSXZDTLMNR'
  SoundexNums  = '111122222222334556'
  SoundexCharsEx = '^' + SoundexChars
  SoundexCharsDel = '^A-Z'
 
  # desc: http://en.wikipedia.org/wiki/Soundex
  #
  # Returns the Soundex code of the string: its first letter followed by the
  # digits of the remaining letters, right-padded with '0' to three digits.
  # Only A-Z (after upcasing) take part. Returns '' when no letter is left.
  #
  # census - when true (default) at most three digits are kept; when false
  #          every digit is returned.
  def soundex(census = true)
    str = self.upcase.delete(SoundexCharsDel)
    return '' if str.empty?

    # Start from the first letter's own code so that "Pfister" does not code
    # the F. For an uncoded first letter tr leaves the letter itself, which
    # never equals a digit.
    last = str[0, 1].tr(SoundexChars, SoundexNums)
    digits = ''

    str[1..-1].each_char do |ch|
      # H and W get no digit and do not separate letters sharing a code.
      next if ch == 'H' || ch == 'W'

      # Vowels yield '' and so let the same digit be emitted again afterwards.
      code = SoundexChars.include?(ch) ? ch.tr(SoundexChars, SoundexNums) : ''
      digits += code unless code.empty? || code == last
      last = code
    end

    digits = digits[0, 3] if census
    str[0, 1] + digits.ljust(3, '0')
  end
 
  # True when both strings have the same Soundex code.
  def sounds_like(other)
    self.soundex == other.soundex
  end
end

Usage

# For each pair of words: print both codes, then whether the pair matches.
%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |first, second|
  [first, second].each do |word|
    puts format('%-8s -> %s', word, word.soundex)
  end
 
  verdict = first.sounds_like(second) ? "sounds" : "does not sound"
  print "'#{first}' "
  print verdict
  print " like '#{second}'\n"
end

#Soundex  -> S532
#Sownteks -> S532
#'Soundex' sounds like 'Sownteks'
#Example  -> E251
#Ekzampul -> E251
#'Example' sounds like 'Ekzampul'
#foo      -> F000
#bar      -> B600
#'foo' does not sound like 'bar'

Scala

/** Returns the four-character Soundex code of `s`: its first character
  * (upper-cased) followed by three digits, right-padded with '0'.
  * An empty `s` yields "0000" instead of throwing.
  *
  * Every character for which getCode returns "" acts as a separator, so the
  * same digit may appear again after it.
  */
def soundex(s:String)={
   if (s.isEmpty) "0000"
   else {
      val first=s.head.toUpper
      // Fold over the remaining characters, carrying (code so far, previous digit).
      val (code, _)=s.drop(1).foldLeft((first.toString, getCode(first))){
         case ((acc, previous), ch) =>
            val current=getCode(ch.toUpper)
            if (!current.isEmpty && current!=previous) (acc+current, current)
            else (acc, current)
      }
      (code+"0000").slice(0,4)
   }
}
 
/** Returns the Soundex digit of `c` as a one-character string, or "" when
  * `c` is not one of the coded upper-case consonants.
  */
def getCode(c:Char)={
   c match {
      case 'B' | 'F' | 'P' | 'V' => "1"
      case 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2"
      case 'D' | 'T' => "3"
      case 'L' => "4"
      case 'M' | 'N' => "5"
      case 'R' => "6"
      case _ => ""
   }
}

Usage

/** Runs soundex over a table of names and prints, per name, the expected
  * code, the computed code and whether the two agree.
  */
def main(args: Array[String]): Unit = {
   // name -> expected code
   val tests=Map(
      "Soundex" -> "S532",
      "Euler" -> "E460",
      "Gauss" -> "G200",
      "Hilbert" -> "H416",
      "Knuth" -> "K530",
      "Lloyd" -> "L300",
      "Lukasiewicz" -> "L222",
      "Ellery" -> "E460",
      "Ghosh" -> "G200",
      "Heilbronn" -> "H416",
      "Kant" -> "K530",
      "Ladd" -> "L300",
      "Lissajous" -> "L222",
      "Wheaton" -> "W350",
      "Ashcraft" -> "A226",
      "Burroughs" -> "B622",
      "Burrows" -> "B620",
      "O'Hara" -> "O600")
 
   tests.foreach{ case (name, expected) =>
      val found=soundex(name)
      val verdict=if (found==expected) "OK" else "ERROR"
      printf("Name: %-20s  Code: %s   Found: %s  - %s\n", name, expected, found, verdict)
   }
}

Swift

The class written by Clifford in this GitHub repository is an implementation of the original Soundex algorithm in the Swift language.

//
//  Soundex.swift
//  speller
//
//  Created by Clifford Helsel on 4/28/16.
//
//  Based on standard Soundex algorithm and loosely ported from Apache Commons
//  https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/Soundex.html


/// American Soundex encoder for the letters A-Z.
public class Soundex {
    
    // en_mapping_string[i] is the Soundex digit of en_alphabet[i];
    // "0" marks a letter that carries no digit.
    private static let en_mapping_string = Array("01230120022455012623010202".characters)
    private static let en_alphabet = Array("ABCDEFGHIJKLMNOPQRSTUVWXYZ".characters)
    private let mapping: [Character:Character] = Soundex.buildMapping(letters:en_alphabet,digits:en_mapping_string)
    
    /// Builds the letter -> digit dictionary by pairing the two arrays index by index.
    private static func buildMapping(letters: Array<Character>, digits: Array<Character>) -> [Character:Character] {
        var retval: [Character:Character] = [:]
        for (index,letter) in letters.enumerated() {
            retval[letter] = digits[index]
        }
        return retval
    }
    
    /// Digit of `c`, or "0" when `c` has no entry in the mapping.
    private func mapChar(c: Character) -> Character {
        if let val = mapping[c] {
            return val
        }
        return "0" // not specified in original Soundex specification, if character is not found, code is 0
    }
    
    /// Returns the four-character Soundex code of `of`: its first character
    /// (upper-cased) followed by three digits, right-padded with "0".
    /// Returns "" for an empty string.
    public func soundex(of: String) -> String {
        
        guard (of.characters.count>0) else {
            return ""
        }
        
        let str = Array(of.uppercased().characters)
        
        // Pre-filled with "0" so that short names come out zero-padded.
        var out: Array<Character> = Array(repeating:"0",count:4)
        var count = 1
        
        out[0] = str[0]
        // Start from the first letter's own code so that a following letter
        // with the same code is not emitted.
        var last = mapChar(c:str[0])
        
        for ch in str.dropFirst() {
            if (count >= out.count) {
                break
            }
            // H and W get no digit and do not separate letters sharing a code.
            if (ch=="H" || ch=="W") {
                continue
            }
            let mapped = mapChar(c:ch)
            if (mapped != "0" && mapped != last) {
                out[count]=mapped
                count += 1
            }
            // Updated on every character: a "0" (vowel or unknown character)
            // lets the same digit be emitted again afterwards.
            last = mapped
        }
        return String(out)
    }
}

Usage

let c = Soundex()

// NOTE(review): C631 is the value quoted for the class as published; the standard Soundex code for "Christopher" is C623 - confirm which one is intended.
c.soundex(of:"Christopher") // C631

VBScript

' Returns the Soundex digit of c as a string. The return value is left
' unassigned (Empty) when c is not one of the coded upper-case consonants.
Function getCode(c)
    If c = "B" Or c = "F" Or c = "P" Or c = "V" Then
        getCode = "1"
    ElseIf c = "C" Or c = "G" Or c = "J" Or c = "K" Or c = "Q" Or c = "S" Or c = "X" Or c = "Z" Then
        getCode = "2"
    ElseIf c = "D" Or c = "T" Then
        getCode = "3"
    ElseIf c = "L" Then
        getCode = "4"
    ElseIf c = "M" Or c = "N" Then
        getCode = "5"
    ElseIf c = "R" Then
        getCode = "6"
    End If
End Function
 
' Returns the four-character Soundex code of s: its first character
' (upper-cased) followed by three digits, right-padded with "0".
' An empty s yields "0000".
Function soundex(s)
    Dim code, previous, current, letter, i
    code = UCase(Mid(s, 1, 1))
    ' Start from the first letter's own code so that "Pfister" does not
    ' code the F.
    previous = getCode(code)
    For i = 2 To Len(s)
        letter = UCase(Mid(s, i, 1))
        ' H and W get no digit and do not separate letters sharing a code.
        If letter <> "H" And letter <> "W" Then
            current = getCode(letter)
            If Len(current) > 0 And current <> previous Then
                code = code & current
            End If
            ' An uncoded character resets previous, acting as a separator.
            previous = current
        End If
    Next
    soundex = Mid(code, 1, 4)
    If Len(code) < 4 Then
        soundex = soundex & String(4 - Len(code), "0")
    End If
End Function

Finally, if you know the implementation of the Soundex algorithm in another language (or you have a better snippet of it in the present languages) don't be shy and share it with us in the comment box, have fun !

Become a more social person