I am struggling to find/create an algorithm that can determine the pronounceability of random 5 letter combinations.
The closest thing I've found so far is from this 3 year old StackOverflow thread:
Measure the pronounceability of a word?
<?php
// Score: 1
echo pronounceability('namelet') . "\n";
// Score: 0.71428571428571
echo pronounceability('nameoic') . "\n";
function pronounceability($word) {
static $vowels = array
(
'a',
'e',
'i',
'o',
'u',
'y'
);
static $composites = array
(
'mm',
'll',
'th',
'ing'
);
if (!is_string($word)) return false;
// Remove non letters and put in lowercase
$word = preg_replace('/[^a-z]/i', '', $word);
$word = strtolower($word);
// Special case
if ($word == 'a') return 1;
$len = strlen($word);
// Let's not parse an empty string
if ($len == 0) return 0;
$score = 0;
$pos = 0;
while ($pos < $len) {
// Check if is allowed composites
foreach ($composites as $comp) {
$complen = strlen($comp);
if (($pos + $complen) < $len) {
$check = substr($word, $pos, $complen);
if ($check == $comp) {
$score += $complen;
$pos += $complen;
continue 2;
}
}
}
// Is it a vowel? If so, check if previous wasn't a vowel too.
if (in_array($word[$pos], $vowels)) {
if (($pos - 1) >= 0 && !in_array($word[$pos - 1], $vowels)) {
$score += 1;
$pos += 1;
continue;
}
} else { // Not a vowel, check if next one is, or if is end of word
if (($pos + 1) < $len && in_array($word[$pos + 1], $vowels)) {
$score += 2;
$pos += 2;
continue;
} elseif (($pos + 1) == $len) {
$score += 1;
break;
}
}
$pos += 1;
}
return $score / $len;
}
?>
... but it is far from perfect, giving some rather strange false positives:
Using this function, all of the following rate as pronounceable, (above 7/10)
Can someone smarter than me tweek this algorithm perhaps so that:
(I have done a fair amount of research/googling, and this seems to be the main pronounceability function that everyone has been referencing/using for the last 3 years, so I'm sure an updated, more refined version would be appreciated by the wider community, not just me!).
Based on a suggestion on the linked question to "Use a Markov model on letters"
Use a Markov model (on letters, not words, of course). The probability of a word is a pretty good proxy for ease of pronunciation.
I thought I would try it out and had some success.
I copied a list of real 5-letter words into a file to serve as my dataset (here...um, actually here).
Then I use a Hidden Markov model (based on One-grams, Bi-grams, and Tri-grams) to predict how likely a target word would appear in that dataset.
(Better results could be achieved with some sort of phonetic transcription as one of the steps.)
First, I calculate the probabilities of character sequences in the dataset.
For example, if 'A' occurs 50 times, and there is only 250 characters in the dataset, then 'A' has a 50/250 or .2 probability.
Do the same for the bigrams 'AB', 'AC', ...
Do the same for the trigrams 'ABC', 'ABD', ...
Basically, my score for the word "ABCDE" is composed of:
You could multiply all of these together to get the estimated probability of the target word appearing in the dataset, (but that is very small).
So instead, we take the logs of each and add them together.
Now we have a score which estimates how likely our target word would appear in the dataset.
I have coded this is C#, and find that a score greater than negative 160 is pretty good.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
namespace Pronouncability
{
class Program
{
public static char[] alphabet = new char[]{ 'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z' };
public static List<string> wordList = loadWordList(); //Dataset of 5-letter words
public static Random rand = new Random();
public const double SCORE_LIMIT = -160.00;
/// <summary>
/// Generates random words, until 100 of them are better than
/// the SCORE_LIMIT based on a statistical score.
/// </summary>
public static void Main(string[] args)
{
Dictionary<Tuple<char, char, char>, int> trigramCounts = new Dictionary<Tuple<char, char, char>, int>();
Dictionary<Tuple<char, char>, int> bigramCounts = new Dictionary<Tuple<char, char>, int>();
Dictionary<char, int> onegramCounts = new Dictionary<char, int>();
calculateProbabilities(onegramCounts, bigramCounts, trigramCounts);
double totalTrigrams = (double)trigramCounts.Values.Sum();
double totalBigrams = (double)bigramCounts.Values.Sum();
double totalOnegrams = (double)onegramCounts.Values.Sum();
SortedList<double, string> randomWordsScores = new SortedList<double, string>();
while( randomWordsScores.Count < 100 )
{
string randStr = getRandomWord();
if (!randomWordsScores.ContainsValue(randStr))
{
double score = getLikelyhood(randStr,trigramCounts, bigramCounts, onegramCounts, totalTrigrams, totalBigrams, totalOnegrams);
if (score > SCORE_LIMIT)
{
randomWordsScores.Add(score, randStr);
}
}
}
//Right now randomWordsScores contains 100 random words which have
//a better score than the SCORE_LIMIT, sorted from worst to best.
}
/// <summary>
/// Generates a random 5-letter word
/// </summary>
public static string getRandomWord()
{
char c0 = (char)rand.Next(65, 90);
char c1 = (char)rand.Next(65, 90);
char c2 = (char)rand.Next(65, 90);
char c3 = (char)rand.Next(65, 90);
char c4 = (char)rand.Next(65, 90);
return "" + c0 + c1 + c2 + c3 + c4;
}
/// <summary>
/// Returns a score for how likely a given word is, based on given trigrams, bigrams, and one-grams
/// </summary>
public static double getLikelyhood(string wordToScore, Dictionary<Tuple<char, char,char>, int> trigramCounts, Dictionary<Tuple<char, char>, int> bigramCounts, Dictionary<char, int> onegramCounts, double totalTrigrams, double totalBigrams, double totalOnegrams)
{
wordToScore = wordToScore.ToUpper();
char[] letters = wordToScore.ToCharArray();
Tuple<char, char>[] bigrams = new Tuple<char, char>[]{
new Tuple<char,char>( wordToScore[0], wordToScore[1] ),
new Tuple<char,char>( wordToScore[1], wordToScore[2] ),
new Tuple<char,char>( wordToScore[2], wordToScore[3] ),
new Tuple<char,char>( wordToScore[3], wordToScore[4] )
};
Tuple<char, char, char>[] trigrams = new Tuple<char, char, char>[]{
new Tuple<char,char,char>( wordToScore[0], wordToScore[1], wordToScore[2] ),
new Tuple<char,char,char>( wordToScore[1], wordToScore[2], wordToScore[3] ),
new Tuple<char,char,char>( wordToScore[2], wordToScore[3], wordToScore[4] ),
};
double score = 0;
foreach (char c in letters)
{
score += Math.Log((((double)onegramCounts[c]) / totalOnegrams));
}
foreach (Tuple<char, char> pair in bigrams)
{
score += Math.Log((((double)bigramCounts[pair]) / totalBigrams));
}
foreach (Tuple<char, char, char> trio in trigrams)
{
score += 5.0*Math.Log((((double)trigramCounts[trio]) / totalTrigrams));
}
return score;
}
/// <summary>
/// Build the probability tables based on the dataset (WordList)
/// </summary>
public static void calculateProbabilities(Dictionary<char, int> onegramCounts, Dictionary<Tuple<char, char>, int> bigramCounts, Dictionary<Tuple<char, char, char>, int> trigramCounts)
{
foreach (char c1 in alphabet)
{
foreach (char c2 in alphabet)
{
foreach( char c3 in alphabet)
{
trigramCounts[new Tuple<char, char, char>(c1, c2, c3)] = 1;
}
}
}
foreach( char c1 in alphabet)
{
foreach( char c2 in alphabet)
{
bigramCounts[ new Tuple<char,char>(c1,c2) ] = 1;
}
}
foreach (char c1 in alphabet)
{
onegramCounts[c1] = 1;
}
foreach (string word in wordList)
{
for (int pos = 0; pos < 3; pos++)
{
trigramCounts[new Tuple<char, char, char>(word[pos], word[pos + 1], word[pos + 2])]++;
}
for (int pos = 0; pos < 4; pos++)
{
bigramCounts[new Tuple<char, char>(word[pos], word[pos + 1])]++;
}
for (int pos = 0; pos < 5; pos++)
{
onegramCounts[word[pos]]++;
}
}
}
/// <summary>
/// Get the dataset (WordList) from file.
/// </summary>
public static List<string> loadWordList()
{
string filePath = "WordList.txt";
string text = File.ReadAllText(filePath);
List<string> result = text.Split(' ').ToList();
return result;
}
}
}
In my example, I scale the trigram probabilities by 5.
I also add one to all of the counts, so we don't multiply by zero.
I'm not a php programmer, but the technique is pretty easy to implement.
Play around with some scaling factors, try different datasets, or add in some other checks like what you suggested above.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With