Soundex is a phonetic algorithm for indexing names by sound, as pronounced in English. Soundex codes from different strings can be compared to see how similar the strings sound when spoken.
The first character of the code is the first character of the expression, converted to upper case. The second through fourth characters of the code are numbers that represent the letters in the expression. The letters A, E, I, O, U, H, W, and Y are ignored unless they are the first letter of the string. All international alphabetic characters outside the A-Z range are treated as vowels. Hence, two strings that sound almost the same should have identical Soundex codes. For instance, the words "text" and "tixt" both produce a Soundex code of "T230".
In this article, you'll find its implementation in the following programming languages :
Let's get started !
C
#include <stdio.h>
/*
 * Soundex class of every 7-bit ASCII character, filled in by init():
 * -1 for the vowels A E I O U, 1..6 for the coded consonants, and 0 for
 * everything else (H, W, Y and non-letters).
 * The element type is explicitly signed: plain char is unsigned on some
 * platforms, where the -1 vowel marker would read back as 255.
 */
static signed char code[128] = { 0 };

/*
 * Returns the four-character Soundex code of s: its first character
 * (upper-cased when it is an ASCII letter) followed by three digits.
 * A NULL or empty s yields an empty string.
 *
 * The result lives in a static buffer that the next call overwrites, so
 * copy it if it has to survive. init() must have been called first.
 */
const char* soundex(const char *s)
{
    static char out[5];
    int c, prev, i;
    unsigned char ch;

    out[0] = out[4] = 0;
    if (!s || !*s) return out;

    out[0] = *s++;
    if (out[0] >= 'a' && out[0] <= 'z') out[0] -= 'a' - 'A';

    /* first letter, though not coded, can still affect next letter: Pfister */
    ch = (unsigned char)out[0];
    prev = ch < 128 ? code[ch] : 0;

    for (i = 1; *s && i < 4; s++) {
        /* Bytes outside the table (e.g. UTF-8 continuation bytes) are
         * treated as uncoded instead of indexing out of bounds. */
        ch = (unsigned char)*s;
        c = ch < 128 ? code[ch] : 0;

        if (c == prev) continue;
        if (c == -1) prev = 0; /* vowel as separator */
        else if (c > 0) {
            out[i++] = c + '0';
            prev = c;
        }
    }
    while (i < 4) out[i++] = '0';
    return out;
}
/*
 * Assigns Soundex class c to every character of s and to the character
 * obtained by flipping bit 0x20, which is the other letter case for ASCII
 * letters. Bytes that fall outside the 128-entry table are skipped rather
 * than used as a (possibly negative) index.
 */
void add_code(const char *s, int c)
{
    for (; *s; s++) {
        unsigned char ch = (unsigned char)*s;

        if (ch >= 128) continue;
        code[ch] = code[ch ^ 0x20] = c;
    }
}
/*
 * Fills the lookup table. The position of each group in cls, minus one,
 * is the class stored for its letters: "AEIOU" gets -1, the empty group
 * stands for class 0, and the consonant groups get 1 through 6.
 */
void init()
{
    static const char *cls[] =
        { "AEIOU", "", "BFPV", "CGJKQSXZ", "DT", "L", "MN", "R", 0};
    int group = 0;

    while (cls[group]) {
        add_code(cls[group], group - 1);
        group++;
    }
}
Usage
/* Demo: builds the lookup table, then prints the code of "Javascript". */
int main()
{
    init();
    /* J126 */
    /* Print through "%s": a computed string must never be the format itself. */
    printf("%s\n", soundex("Javascript"));
    return 0;
}
C#
using System.Text.RegularExpressions;
/// <summary>
/// American Soundex encoder: first character plus three digits.
/// </summary>
public static class Soundex
{
    private const int MaxSoundexCodeLength = 4;

    /// <summary>
    /// Returns the Soundex code of <paramref name="word"/>.
    /// </summary>
    /// <param name="word">
    /// Text to encode. It is upper-cased and stripped of every character that
    /// is neither a word character nor whitespace before encoding.
    /// </param>
    /// <returns>
    /// A four-character code; "0000" when the input is null or nothing is
    /// left after stripping.
    /// </returns>
    public static string For(string word)
    {
        word = Regex.Replace(
            word == null ? string.Empty : word.ToUpperInvariant(),
            @"[^\w\s]",
            string.Empty);

        if (string.IsNullOrEmpty(word))
            return new string('0', MaxSoundexCodeLength);

        // Fully qualified so the snippet only needs the Regex namespace.
        var soundexCode = new System.Text.StringBuilder(MaxSoundexCodeLength);
        soundexCode.Append(word[0]);

        // The first letter is kept verbatim, but its digit still suppresses
        // an immediately following letter of the same group (e.g. "Pfister").
        var previous = GetCharNumberForLetter(word[0]);

        for (var i = 1; i < word.Length && soundexCode.Length < MaxSoundexCodeLength; i++)
        {
            var letter = word[i];

            // H and W are transparent: they neither produce a digit nor
            // separate two letters of the same group.
            if (letter == 'H' || letter == 'W')
                continue;

            var current = GetCharNumberForLetter(letter);
            if (current != '0' && current != previous)
                soundexCode.Append(current);

            // Anything mapped to '0' (vowels, digits, whitespace) resets
            // "previous", so equal digits on both sides are both kept.
            previous = current;
        }

        return soundexCode.ToString().PadRight(MaxSoundexCodeLength, '0');
    }

    /// <summary>
    /// Maps an upper-case letter to its Soundex digit, or '0' when the
    /// character is not one of the coded consonants.
    /// </summary>
    private static char GetCharNumberForLetter(char letter)
    {
        switch (letter)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K':
            case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return '0';
        }
    }
}
Usage
// A bare comparison is not a valid C# statement, so the result is stored.
bool soundsAlike = Soundex.For("CSharp Language") == Soundex.For("CSherp Language"); // True as C614 == C614
D
The D standard library (Phobos) already contains a soundex function.
import std.stdio: writeln;
import std.string: soundex;
// Checks the library soundex function against a table of
// [input, expected code] pairs.
void main() {
    auto cases = [
        ["soundex", "S532"],
        ["example", "E251"],
        ["ciondecks", "C532"],
        ["ekzampul", "E251"],
        ["Robert", "R163"],
        ["Rupert", "R163"],
        ["Rubin", "R150"],
        ["Ashcraft", "A261"],
        ["Ashcroft", "A261"],
        ["Tymczak", "T522"],
    ];
    foreach (pair; cases)
        assert(soundex(pair[0]) == pair[1]);
}
F#
/// Returns the American Soundex code of x: its first character (upper-cased)
/// followed by three digits. An empty or null string yields "0000".
let americanSoundex (x : string) =
    // Soundex digit of an upper-case letter; None for vowels, H, W, Y and
    // anything that is not a coded consonant.
    let codeOf ch =
        match ch with
        | 'B' | 'F' | 'P' | 'V' -> Some '1'
        | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> Some '2'
        | 'D' | 'T' -> Some '3'
        | 'L' -> Some '4'
        | 'M' | 'N' -> Some '5'
        | 'R' -> Some '6'
        | _ -> None
    // The guard comes first so that no list operation runs on empty input.
    if System.String.IsNullOrEmpty x then "0000"
    else
        let letters = x.ToUpperInvariant().ToCharArray() |> Array.toList
        let first = List.head letters
        // H and W are transparent after the first letter: they are dropped
        // before neighbouring codes are compared. The first letter itself is
        // always kept, even when it is H or W.
        let rest =
            List.tail letters
            |> List.filter (fun ch -> ch <> 'H' && ch <> 'W')
        // Walk the remaining letters remembering the previous code; a digit is
        // emitted only when it differs from that previous code. Uncoded
        // characters reset the previous code to None and so act as separators.
        let step (previous, digitsRev) ch =
            let current = codeOf ch
            match current with
            | Some digit when current <> previous -> (current, digit :: digitsRev)
            | _ -> (current, digitsRev)
        let (_, digitsRev) = rest |> List.fold step (codeOf first, [])
        let digits = digitsRev |> List.rev |> List.truncate 3
        let padding = List.replicate (3 - List.length digits) '0'
        new System.String((first :: (digits @ padding)) |> Array.ofList)
// Prints each sample name next to its Soundex code, one pair per line.
for name in
    [ "Robert"; "Rupert"; "Robbert"; "Rubin"
      "Beer"; "Bear"; "Bearer"
      "Smith"; "Smyth"
      "Ashcraft"; "Ashcroft"
      "Tymczak"; "Pfister" ] do
    printfn "%-8s = %s" name (americanSoundex name)
(*
Robert = R163
Rupert = R163
Robbert = R163
Rubin = R150
Beer = B600
Bear = B600
Bearer = B660
Smith = S530
Smyth = S530
Ashcraft = A261
Ashcroft = A261
Tymczak = T522
Pfister = P236
*)
Go
package myPackageName
import (
"bytes"
"strings"
"fmt"
)
// codeLen is the length of a Soundex code: one letter plus three digits.
const codeLen = 4

// codes maps each lower-case ASCII letter to its Soundex digit. Letters that
// carry no digit (vowels, h, w, y) map to the empty string.
var codes = map[string]string{
	"a": "",
	"b": "1",
	"c": "2",
	"d": "3",
	"e": "",
	"f": "1",
	"g": "2",
	"h": "",
	"i": "",
	"j": "2",
	"k": "2",
	"l": "4",
	"m": "5",
	"n": "5",
	"o": "",
	"p": "1",
	"q": "2",
	"r": "6",
	"s": "2",
	"t": "3",
	"u": "",
	"v": "1",
	"w": "",
	"x": "2",
	"y": "",
	"z": "2",
}

// Soundex returns the four-character Soundex code of s in upper case: the
// first byte of s followed by three digits, padded with zeros. An empty s
// yields an empty string instead of panicking.
//
// The input is processed byte by byte, so it is expected to be ASCII.
func Soundex(s string) string {
	if s == "" {
		return ""
	}

	var encoded bytes.Buffer
	encoded.WriteByte(s[0])

	// The first letter is kept as is, but its digit still suppresses an
	// immediately following letter of the same group (e.g. "Pfister").
	previous := codes[strings.ToLower(string(s[0]))]

	for i := 1; i < len(s) && encoded.Len() < codeLen; i++ {
		current := strings.ToLower(string(s[i]))

		// h and w are transparent: they neither produce a digit nor
		// separate two letters of the same group.
		if current == "h" || current == "w" {
			continue
		}

		code := codes[current]
		if code != "" && code != previous {
			encoded.WriteString(code)
		}
		// Vowels and unknown bytes have an empty code, which resets
		// previous so that equal digits on both sides are both kept.
		previous = code
	}

	if encoded.Len() < codeLen {
		padding := strings.Repeat("0", codeLen-encoded.Len())
		encoded.WriteString(padding)
	}
	return strings.ToUpper(encoded.String())
}
Usage
// main prints the Soundex code of a sample word.
func main() {
	/* J126 */
	code := Soundex("Javascript")
	fmt.Println(code)
}
Java
/**
 * Returns the Soundex digit of an upper-case letter as a one-character
 * string, or the empty string when the character is not a coded consonant.
 */
private static String getCode(char c){
    if ("BFPV".indexOf(c) >= 0) {
        return "1";
    }
    if ("CGJKQSXZ".indexOf(c) >= 0) {
        return "2";
    }
    if (c == 'D' || c == 'T') {
        return "3";
    }
    if (c == 'L') {
        return "4";
    }
    if (c == 'M' || c == 'N') {
        return "5";
    }
    if (c == 'R') {
        return "6";
    }
    return "";
}
/**
 * Returns the four-character Soundex code of {@code s}: its first character
 * in upper case followed by three digits, padded with zeros.
 *
 * @param s the text to encode; may be null or empty
 * @return the Soundex code, or the empty string for null or empty input
 */
public static String soundex(String s){
    if (s == null || s.isEmpty()) {
        return "";
    }
    // Upper-case once; a fixed locale avoids locale-specific case mappings.
    String upper = s.toUpperCase(java.util.Locale.ROOT);

    StringBuilder code = new StringBuilder(4);
    code.append(upper.charAt(0));

    // The first letter is kept verbatim, but its digit still suppresses an
    // immediately following letter of the same group (e.g. "Pfister").
    String previous = getCode(upper.charAt(0));

    for (int i = 1; i < upper.length() && code.length() < 4; i++) {
        char letter = upper.charAt(i);
        // H and W are transparent: they neither produce a digit nor
        // separate two letters of the same group.
        if (letter == 'H' || letter == 'W') {
            continue;
        }
        String current = getCode(letter);
        if (!current.isEmpty() && !current.equals(previous)) {
            code.append(current);
        }
        // An empty code (vowel or other character) resets "previous".
        previous = current;
    }
    while (code.length() < 4) {
        code.append('0');
    }
    return code.toString();
}
Usage
/** Prints the Soundex code of a few sample words, one per line. */
public static void main(String[] args){
    // Expected output, in order: S532, E251, S532, E251
    String[] words = {"Soundex", "Example", "Sownteks", "Ekzampul"};
    for (String word : words) {
        System.out.println(soundex(word));
    }
}
Javascript
/**
 * Returns the four-character Soundex code of `s` in upper case: the first
 * character followed by three digits, padded with zeros.
 * An empty string yields an empty string.
 *
 * @param {string} s text to encode
 * @returns {string} the Soundex code
 */
var soundex = function(s) {
    var codes = { a: '', e: '', i: '', o: '', u: '', b: 1, f: 1, p: 1, v: 1, c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2, d: 3, t: 3, l: 4, m: 5, n: 5, r: 6 };
    var letters = s.toLowerCase().split('');

    // Without this guard the first "letter" would be undefined and the
    // result would be the text "UNDE".
    if (letters.length === 0) {
        return '';
    }

    var first = letters.shift();
    // The first letter is kept verbatim, but its digit still suppresses an
    // immediately following letter of the same group (e.g. "Pfister").
    var previous = codes[first];
    var digits = '';

    letters.forEach(function(letter) {
        // h and w are transparent: they neither produce a digit nor
        // separate two letters of the same group.
        if (letter === 'h' || letter === 'w') {
            return;
        }
        var code = codes[letter];
        if (code && code !== previous) {
            digits += code;
        }
        // Vowels ('') and unknown characters (undefined) reset "previous".
        previous = code;
    });

    return (first + digits + '000').slice(0, 4).toUpperCase();
};
Usage
// Both spellings are encoded as J126, so the comparison is true.
soundex("Javascript") == soundex("Jabascript"); // True as J126 == J126
Objective-C
You can find an implementation of the Soundex algorithm in Objective-C in this GitHub gist, written by Darkseed.
PHP
PHP already has soundex as a built-in function that calculates the Soundex key of a string.
Usage
soundex("PHP Server Language") == soundex("PHP Serber language"); // True as P261 == P261
Python
Function
def get_soundex(name):
    """Get the soundex code for the string.

    The code is the first character of ``name`` in upper case followed by
    three digits, padded with zeros. An empty ``name`` yields an empty string.
    """
    if not name:
        return ""

    groups = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6"}
    codes = {letter: digit for letters, digit in groups.items() for letter in letters}

    name = name.upper()
    soundex = name[0]
    # The first letter is kept verbatim, but its digit still suppresses an
    # immediately following letter of the same group (e.g. "Pfister").
    previous = codes.get(name[0], "")

    for char in name[1:]:
        # H and W are transparent, and so is anything that is not a letter:
        # they neither produce a digit nor separate two equal digits.
        if char in ("H", "W") or not char.isalpha():
            continue
        code = codes.get(char, "")
        if code and code != previous:
            soundex += code
        # Vowels (and any other uncoded letter) reset the previous digit.
        previous = code

    return soundex[:4].ljust(4, "0")
Usage
# Print a two-column table of sample names and their soundex codes.
# The list is called "names" so that the builtin "list" is not shadowed.
names = ["Smith", "Smythe", "Robert", "Rupert", "Schultz", "Shultz"]
print("NAME\t\tSOUNDEX")
for name in names:
    print("%s\t\t%s" % (name, get_soundex(name)))
Library
If you prefer to use a library, you can use the fuzzy package (which uses C Extensions (via Pyrex) for speed).
Ruby
class String
  # Coded consonants and, position by position, their Soundex digits.
  SoundexChars = 'BFPVCGJKQSXZDTLMNR'
  SoundexNums = '111122222222334556'
  SoundexCharsEx = '^' + SoundexChars
  SoundexCharsDel = '^A-Z'

  # desc: http://en.wikipedia.org/wiki/Soundex
  #
  # Returns the Soundex code of the string: its first letter (upper-cased)
  # followed by digits, padded with zeros to at least three digits.
  # With census = true (the default) the code is cut to three digits;
  # with census = false every digit is kept.
  # A string without any A-Z letter yields ''.
  def soundex(census = true)
    letters = upcase.delete(SoundexCharsDel)
    return '' if letters.empty?

    # Digit of a letter, or nil when the letter is not a coded consonant.
    code_of = lambda do |letter|
      position = SoundexChars.index(letter)
      position && SoundexNums[position, 1]
    end

    # The first letter is kept verbatim, but its digit still suppresses an
    # immediately following letter of the same group (e.g. "Pfister").
    previous = code_of.call(letters[0, 1])
    digits = []
    letters[1..-1].each_char do |letter|
      # H and W are transparent: they never separate two equal digits.
      next if letter == 'H' || letter == 'W'

      code = code_of.call(letter)
      digits << code if code && code != previous
      # A vowel has no code (nil) and therefore resets "previous", so equal
      # digits on either side of it are both kept (e.g. "Bearer" -> B660).
      previous = code
    end

    digits = digits.first(3) if census
    letters[0, 1] + digits.join.ljust(3, '0')
  end

  # True when both strings have the same Soundex code.
  def sounds_like(other)
    soundex == other.soundex
  end
end
Usage
# Takes the sample words two at a time, prints the code of each word,
# then reports whether the two words of the pair sound alike.
%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |first, second|
  [first, second].each do |word|
    puts format('%-8s -> %s', word, word.soundex)
  end
  verdict = first.sounds_like(second) ? "sounds" : "does not sound"
  print "'#{first}' #{verdict} like '#{second}'\n"
end
#Soundex -> S532
#Sownteks -> S532
#'Soundex' sounds like 'Sownteks'
#Example -> E251
#Ekzampul -> E251
#'Example' sounds like 'Ekzampul'
#foo -> F000
#bar -> B600
#'foo' does not sound like 'bar'
Scala
/** Returns the four-character Soundex code of `s`: its first character in
  * upper case followed by three digits, padded with zeros.
  * A null or empty `s` yields the empty string instead of throwing.
  *
  * Every character without a digit, H and W included, separates equal
  * digits here; that is the variant this snippet's own sample table expects.
  */
def soundex(s:String)={
  if (s==null || s.isEmpty) ""
  else {
    val first=s.head.toUpper
    val code=new StringBuilder(first.toString)
    // The first letter is kept verbatim, but its digit still suppresses an
    // immediately following letter of the same group.
    var previous=getCode(first)
    for(ch <- s.drop(1)){
      val current=getCode(ch.toUpper)
      if (current.nonEmpty && current!=previous)
        code.append(current)
      previous=current
    }
    code.append("0000")
    code.toString.take(4)
  }
}
/** Returns the Soundex digit of an upper-case letter as a one-character
  * string, or the empty string when the character is not a coded consonant.
  */
def getCode(c:Char)={
  c match {
    case 'B' | 'F' | 'P' | 'V' => "1"
    case 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2"
    case 'D' | 'T' => "3"
    case 'L' => "4"
    case 'M' | 'N' => "5"
    case 'R' => "6"
    case _ => ""
  }
}
Usage
/** Runs soundex over a table of names and prints, for each one, the
  * expected code, the computed code and whether they match.
  */
def main(args: Array[String]): Unit = {
  // name -> expected code
  val tests=Map(
    "Soundex" -> "S532",
    "Euler" -> "E460",
    "Gauss" -> "G200",
    "Hilbert" -> "H416",
    "Knuth" -> "K530",
    "Lloyd" -> "L300",
    "Lukasiewicz" -> "L222",
    "Ellery" -> "E460",
    "Ghosh" -> "G200",
    "Heilbronn" -> "H416",
    "Kant" -> "K530",
    "Ladd" -> "L300",
    "Lissajous" -> "L222",
    "Wheaton" -> "W350",
    "Ashcraft" -> "A226",
    "Burroughs" -> "B622",
    "Burrows" -> "B620",
    "O'Hara" -> "O600")
  for ((name, expected) <- tests) {
    val code=soundex(name)
    val status=if (code==expected) "OK" else "ERROR"
    printf("Name: %-20s Code: %s Found: %s - %s\n", name, expected, code, status)
  }
}
Swift
The class below, written by Clifford Helsel and available in this GitHub repository, implements the original Soundex algorithm in Swift.
//
// Soundex.swift
// speller
//
// Created by Clifford Helsel on 4/28/16.
//
// Based on standard Soundex algorithm and loosely ported from Apache Commons
// https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/Soundex.html
/// American Soundex encoder: first character plus three digits.
public class Soundex {
    /// Soundex digit of each letter of `en_alphabet`, position by position;
    /// "0" marks a letter that carries no digit.
    private static let en_mapping_string = Array("01230120022455012623010202")
    private static let en_alphabet = Array("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

    /// Length of a complete code: one letter followed by three digits.
    private static let codeLength = 4

    /// Upper-case letter -> Soundex digit.
    private let mapping: [Character: Character] = Soundex.buildMapping(
        letters: Soundex.en_alphabet,
        digits: Soundex.en_mapping_string
    )

    /// Creates an encoder. Declared explicitly so the public class can also
    /// be instantiated from outside its module.
    public init() {}

    /// Pairs every letter with the digit at the same position.
    private static func buildMapping(letters: [Character], digits: [Character]) -> [Character: Character] {
        var table: [Character: Character] = [:]
        for (letter, digit) in zip(letters, digits) {
            table[letter] = digit
        }
        return table
    }

    /// Returns the digit of `c`, or "0" when `c` is not in the table.
    private func mapChar(c: Character) -> Character {
        // not specified in original Soundex specification, if character is not found, code is 0
        return mapping[c] ?? "0"
    }

    /// Returns the Soundex code of `word`.
    ///
    /// - Parameter word: The text to encode; it is upper-cased first.
    /// - Returns: The first character followed by three digits, padded with
    ///   "0"; an empty string when `word` is empty.
    public func soundex(of word: String) -> String {
        let letters = Array(word.uppercased())
        guard let first = letters.first else {
            return ""
        }

        var out: [Character] = [first]
        // The first letter is kept verbatim, but its digit still suppresses
        // an immediately following letter of the same group (e.g. "Pfister").
        var last = mapChar(c: first)

        for letter in letters.dropFirst() {
            if out.count == Soundex.codeLength {
                break
            }
            // H and W are transparent: they neither produce a digit nor
            // separate two letters of the same group.
            if letter == "H" || letter == "W" {
                continue
            }
            let mapped = mapChar(c: letter)
            if mapped != "0" && mapped != last {
                out.append(mapped)
            }
            // Must be updated on every letter: a vowel ("0") resets it, and a
            // digit is remembered so that the next equal digit is skipped.
            last = mapped
        }

        while out.count < Soundex.codeLength {
            out.append("0")
        }
        return String(out)
    }
}
Usage
let c = Soundex()
// The standard Soundex code of "Christopher" is C623 (C, then R=6, S=2, T=3).
let code = c.soundex(of: "Christopher")
VBScript
' Returns the Soundex digit of the upper-case letter c as a string.
' When c matches none of the cases the function value is never assigned,
' so the caller receives Empty.
Function getCode(c)
    Select Case c
        Case "B", "F", "P", "V" : getCode = "1"
        Case "C", "G", "J", "K", "Q", "S", "X", "Z" : getCode = "2"
        Case "D", "T" : getCode = "3"
        Case "L" : getCode = "4"
        Case "M", "N" : getCode = "5"
        Case "R" : getCode = "6"
    End Select
End Function
' Returns the four-character Soundex code of s: its first character in
' upper case followed by three digits, padded with zeros.
' An empty s yields "0000".
Function soundex(s)
    Dim code, previous, current, letter, i
    code = UCase(Mid(s, 1, 1))
    ' The first letter is kept verbatim, but its digit still suppresses an
    ' immediately following letter of the same group (e.g. "Pfister").
    previous = getCode(code)
    For i = 2 To Len(s)
        letter = UCase(Mid(s, i, 1))
        ' H and W are transparent: they neither produce a digit nor
        ' separate two letters of the same group.
        If letter <> "H" And letter <> "W" Then
            current = getCode(letter)
            If Len(current) > 0 And current <> previous Then
                code = code & current
            End If
            ' A letter without a digit resets "previous".
            previous = current
        End If
    Next
    soundex = Mid(code, 1, 4)
    If Len(code) < 4 Then
        soundex = soundex & String(4 - Len(code), "0")
    End If
End Function
Finally, if you know the implementation of the Soundex algorithm in another language (or you have a better snippet of it in the present languages) don't be shy and share it with us in the comment box, have fun !