Good day. The other day I had to implement post-processing for the results of optical character recognition. A spell-checking model turned out to be a good fit for this problem, although it had to be adapted slightly to the context of the task. This post is devoted to the noisy channel model, which performs automatic spell correction: we will go over the mathematical model, write some C# code, train the model on data collected by Peter Norvig, and finally test what we get.
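In brief, the idea is the following (a standard statement of the noisy channel model; the notation is mine, added for orientation, but it matches what the code below computes). An intended word $w$ passes through a noisy channel (a typist, an OCR engine) that corrupts it into the observed string $x$; correction means choosing the vocabulary word with the highest posterior probability:

$$\hat{w} = \operatorname*{arg\,max}_{w \in V} P(w \mid x) = \operatorname*{arg\,max}_{w \in V} P(x \mid w)\,P(w).$$

The prior $P(w)$ is estimated from word frequencies, and the channel term $P(x \mid w)$ is approximated by a product over the individual edit operations (deletions, insertions, substitutions) relating $w$ and $x$, so the score of a candidate in log space is

$$\log P(w) + \sum_{e \in \mathrm{edits}(x,\, w)} \log P(e).$$

To enumerate those edit operations we first need the Levenshtein distance.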


```csharp
public static int LevenshteinDistance(string s, string t)
{
    int[,] d = new int[s.Length + 1, t.Length + 1];
    // boundary conditions: transforming a prefix into the empty string and vice versa
    for (int i = 0; i < s.Length + 1; i++)
    {
        d[i, 0] = i;
    }
    for (int j = 1; j < t.Length + 1; j++)
    {
        d[0, j] = j;
    }
    for (int i = 1; i <= s.Length; i++)
    {
        for (int j = 1; j <= t.Length; j++)
        {
            d[i, j] = (new int[]
            {
                d[i - 1, j] + 1,                                 // del
                d[i, j - 1] + 1,                                 // ins
                d[i - 1, j - 1] + (s[i - 1] == t[j - 1] ? 0 : 1) // sub
            }).Min();
        }
    }
    return d[s.Length, t.Length];
}
```
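A quick way to sanity-check the function (my own example, not from the original post):

```csharp
// "kitten" -> "sitting": substitute k->s, substitute e->i, insert g => distance 3
Console.WriteLine(LevenshteinDistance("kitten", "sitting")); // 3
// "he;;o" -> "hello": two substitutions => distance 2
Console.WriteLine(LevenshteinDistance("he;;o", "hello"));    // 2
```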
The distance alone is not enough: for the error model we need to know which operations were applied. So, alongside the distance matrix d, we fill a backtrace matrix b that records which operation produced each cell (0 - delete from source; 1 - insert from target into source; 2 - replace a symbol in source with a symbol from target). The only change in the inner loop is that the minimum is now taken over a list, so that its index can be stored:

```csharp
d[i, j] = (new int[]
{
    d[i - 1, j] + 1, // del - 0
    d[i, j - 1] + 1, // ins - 1
    d[i - 1, j - 1] + (s[i - 1] == t[j - 1] ? 0 : 1) // sub - 2
}).Min();
```

becomes

```csharp
IList<int> vals = new List<int>()
{
    d[i - 1, j] + 1, // del
    d[i, j - 1] + 1, // ins
    d[i - 1, j - 1] + (s[i - 1] == t[j - 1] ? 0 : 1) // sub
};
d[i, j] = vals.Min();
int idx = vals.IndexOf(d[i, j]);
b[i, j] = idx;
```

The full function walks b backwards from the bottom-right corner and returns both the distance and the list of (operation, symbols) pairs:

```csharp
// del - 0, ins - 1, sub - 2
public static Tuple<int, IList<Tuple<int, string>>> LevenshteinDistanceWithBacktrace(string s, string t)
{
    int[,] d = new int[s.Length + 1, t.Length + 1];
    int[,] b = new int[s.Length + 1, t.Length + 1];
    for (int i = 0; i < s.Length + 1; i++)
    {
        d[i, 0] = i;
    }
    for (int j = 1; j < t.Length + 1; j++)
    {
        d[0, j] = j;
        b[0, j] = 1;
    }
    for (int i = 1; i <= s.Length; i++)
    {
        for (int j = 1; j <= t.Length; j++)
        {
            IList<int> vals = new List<int>()
            {
                d[i - 1, j] + 1, // del
                d[i, j - 1] + 1, // ins
                d[i - 1, j - 1] + (s[i - 1] == t[j - 1] ? 0 : 1) // sub
            };
            d[i, j] = vals.Min();
            int idx = vals.IndexOf(d[i, j]);
            b[i, j] = idx;
        }
    }
    // walk the backtrace matrix, collecting one (operation, symbols) pair per edit
    List<Tuple<int, string>> bt = new List<Tuple<int, string>>();
    int x = s.Length;
    int y = t.Length;
    while (bt.Count != d[s.Length, t.Length])
    {
        switch (b[x, y])
        {
            case 0: // deletion: the removed symbol of s
                x--;
                bt.Add(new Tuple<int, string>(0, s[x].ToString()));
                break;
            case 1: // insertion: the inserted symbol of t
                y--;
                bt.Add(new Tuple<int, string>(1, t[y].ToString()));
                break;
            case 2: // substitution (skipped when the symbols match)
                x--;
                y--;
                if (s[x] != t[y])
                {
                    bt.Add(new Tuple<int, string>(2, s[x] + "" + t[y]));
                }
                break;
        }
    }
    bt.Reverse();
    return new Tuple<int, IList<Tuple<int, string>>>(d[s.Length, t.Length], bt);
}
```
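Again a small check of my own (not from the original post):

```csharp
var r = LevenshteinDistanceWithBacktrace("he;;o", "hello");
Console.WriteLine(r.Item1); // 2
foreach (Tuple<int, string> op in r.Item2)
{
    // prints "2: ;l" twice: both ';' symbols are replaced with 'l'
    Console.WriteLine(op.Item1 + ": " + op.Item2);
}
```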




Putting it all together: the NoisyChannel class stores the log-priors of the dictionary words (indexed by word length, so that only words of a suitable length are considered as candidates) and the smoothed log-probabilities of each mistake type. The two Levenshtein functions from above live in its static helper region and are omitted from the listing to avoid repetition.

```csharp
using System;
using System.Collections.Generic;
using System.Linq;

public class NoisyChannel
{
    #region vars
    private string[] _words = null;
    private double[] _wLogPriors = null;
    // length of word -> indices of dictionary words of that length
    private IDictionary<int, IList<int>> _wordLengthDictionary = null;
    private IDictionary<int, IDictionary<string, double>> _mistakeLogProbs = null;
    private double _lf = 1d; // additive smoothing factor for mistake probabilities
    private IDictionary<int, int> _mNorms = null;
    #endregion

    #region ctor
    public NoisyChannel(string[] words, long[] wordFrequency,
                        IDictionary<int, IDictionary<string, int>> mistakeFrequency,
                        int mistakeProbSmoothing = 1)
    {
        _words = words;
        _wLogPriors = new double[_words.Length];
        _wordLengthDictionary = new SortedDictionary<int, IList<int>>();
        double wNorm = wordFrequency.Sum();
        for (int i = 0; i < _words.Length; i++)
        {
            _wLogPriors[i] = Math.Log((wordFrequency[i] + 0d)/wNorm);
            int wl = _words[i].Length;
            if (!_wordLengthDictionary.ContainsKey(wl))
            {
                _wordLengthDictionary.Add(wl, new List<int>());
            }
            _wordLengthDictionary[wl].Add(i);
        }
        _lf = mistakeProbSmoothing;
        _mistakeLogProbs = new Dictionary<int, IDictionary<string, double>>();
        _mNorms = new Dictionary<int, int>();
        foreach (int mType in mistakeFrequency.Keys)
        {
            int mNorm = mistakeFrequency[mType].Sum(m => m.Value);
            _mNorms.Add(mType, mNorm);
            int mUnique = mistakeFrequency[mType].Count;
            _mistakeLogProbs.Add(mType, new Dictionary<string, double>());
            foreach (string m in mistakeFrequency[mType].Keys)
            {
                _mistakeLogProbs[mType].Add(m,
                    Math.Log((mistakeFrequency[mType][m] + _lf)/
                             (mNorm + _lf*mUnique)));
            }
        }
    }
    #endregion

    #region correction
    public IDictionary<string, double> GetCandidates(string s, int maxEditDistance = 2)
    {
        IDictionary<string, double> candidates = new Dictionary<string, double>();
        // only words whose length differs from s by at most maxEditDistance can qualify
        IList<int> dists = new List<int>();
        for (int i = s.Length - maxEditDistance; i <= s.Length + maxEditDistance; i++)
        {
            if (i >= 0)
            {
                dists.Add(i);
            }
        }
        foreach (int dist in dists)
        {
            if (!_wordLengthDictionary.ContainsKey(dist)) // no dictionary words of this length
            {
                continue;
            }
            foreach (int tIdx in _wordLengthDictionary[dist])
            {
                string t = _words[tIdx];
                Tuple<int, IList<Tuple<int, string>>> d = LevenshteinDistanceWithBacktrace(s, t);
                if (d.Item1 > maxEditDistance)
                {
                    continue;
                }
                double p = _wLogPriors[tIdx];             // log P(w)
                foreach (Tuple<int, string> m in d.Item2) // plus log P(e) for each edit
                {
                    if (!_mistakeLogProbs[m.Item1].ContainsKey(m.Item2))
                    {
                        // unseen mistake: log of its smoothed probability
                        p += Math.Log(_lf/(_mNorms[m.Item1] + _lf*_mistakeLogProbs[m.Item1].Count));
                    }
                    else
                    {
                        p += _mistakeLogProbs[m.Item1][m.Item2];
                    }
                }
                candidates.Add(_words[tIdx], p);
            }
        }
        candidates = candidates.OrderByDescending(c => c.Value)
                               .ToDictionary(c => c.Key, c => c.Value);
        return candidates;
    }
    #endregion

    #region static helper
    // LevenshteinDistanceWithBacktrace and LevenshteinDistance,
    // exactly as listed above
    #endregion
}
```

The priors are estimated from Norvig's count_1w.txt frequency list (and cached in binary files after the first run), and the mistake frequencies are collected from spell-errors.txt, where each line maps a correct word to a comma-separated list of observed misspellings:

```csharp
// inside Main; additionally requires:
// using System.IO;
// using System.Runtime.Serialization.Formatters.Binary;
string[] words = null;
long[] wordFrequency = null;
#region read priors
if (!File.Exists("../../../Data/words.bin") || !File.Exists("../../../Data/wordFrequency.bin"))
{
    IDictionary<string, long> wf = new Dictionary<string, long>();
    Console.Write("Reading data:");
    using (StreamReader sr = new StreamReader("../../../Data/count_1w.txt"))
    {
        string line = sr.ReadLine();
        while (line != null)
        {
            string[] parts = line.Split('\t');
            wf.Add(parts[0].Trim(), Convert.ToInt64(parts[1]));
            line = sr.ReadLine();
            Console.Write(".");
        }
        sr.Close();
    }
    Console.WriteLine("Done!");
    words = wf.Keys.ToArray();
    wordFrequency = wf.Values.ToArray();
    using (FileStream fs = File.Create("../../../Data/words.bin"))
    {
        BinaryFormatter bf = new BinaryFormatter();
        bf.Serialize(fs, words);
        fs.Flush();
        fs.Close();
    }
    using (FileStream fs = File.Create("../../../Data/wordFrequency.bin"))
    {
        BinaryFormatter bf = new BinaryFormatter();
        bf.Serialize(fs, wordFrequency);
        fs.Flush();
        fs.Close();
    }
}
else
{
    using (FileStream fs = File.OpenRead("../../../Data/words.bin"))
    {
        BinaryFormatter bf = new BinaryFormatter();
        words = bf.Deserialize(fs) as string[];
        fs.Close();
    }
    using (FileStream fs = File.OpenRead("../../../Data/wordFrequency.bin"))
    {
        BinaryFormatter bf = new BinaryFormatter();
        wordFrequency = bf.Deserialize(fs) as long[];
        fs.Close();
    }
}
#endregion

// del - 0, ins - 1, sub - 2
IDictionary<int, IDictionary<string, int>> mistakeFrequency =
    new Dictionary<int, IDictionary<string, int>>();
#region read mistakes
IDictionary<string, IList<string>> misspelledWords = new SortedDictionary<string, IList<string>>();
using (StreamReader sr = new StreamReader("../../../Data/spell-errors.txt"))
{
    string line = sr.ReadLine();
    while (line != null)
    {
        string[] parts = line.Split(':');
        string wt = parts[0].Trim();
        misspelledWords.Add(wt, parts[1].Split(',').Select(w => w.Trim()).ToList());
        line = sr.ReadLine();
    }
    sr.Close();
}
mistakeFrequency.Add(0, new Dictionary<string, int>());
mistakeFrequency.Add(1, new Dictionary<string, int>());
mistakeFrequency.Add(2, new Dictionary<string, int>());
// count every edit operation on the path from the correct word to each misspelling
foreach (string s in misspelledWords.Keys)
{
    foreach (string t in misspelledWords[s])
    {
        var d = NoisyChannel.LevenshteinDistanceWithBacktrace(s, t);
        foreach (Tuple<int, string> ml in d.Item2)
        {
            if (!mistakeFrequency[ml.Item1].ContainsKey(ml.Item2))
            {
                mistakeFrequency[ml.Item1].Add(ml.Item2, 0);
            }
            mistakeFrequency[ml.Item1][ml.Item2]++;
        }
    }
}
#endregion
```

Finally, we create the model and ask it to correct the string "he;;o", timing the query:

```csharp
NoisyChannel nc = new NoisyChannel(words, wordFrequency, mistakeFrequency, 1);
Stopwatch timer = new Stopwatch(); // using System.Diagnostics;
timer.Start();
IDictionary<string, double> c = nc.GetCandidates("he;;o", 2);
timer.Stop();
TimeSpan ts = timer.Elapsed;
Console.WriteLine(ts.ToString());
```
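GetCandidates returns the dictionary already sorted by descending log-probability, so the most likely corrections come first. A small inspection snippet of my own (not from the original post); one would expect "hello" at or near the top:

```csharp
// print the five highest-scoring candidates
foreach (var cand in c.Take(5))
{
    Console.WriteLine(cand.Key + "\t" + cand.Value);
}
```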
Source: https://habr.com/ru/post/202908/