📜 ⬆️ ⬇️

Detecting "wrong" words in the fight against spam

While fighting spam on a forum, I had the idea of automatically catching words that look like "normal" words but actually differ from the entries in the stop-word database. Spammers achieve this by replacing Cyrillic characters with look-alike Latin characters and vice versa. For example, "Продаем бетон" ("Selling concrete") written entirely in Cyrillic and the same phrase with the Latin letters p, o, a, e substituted for their Cyrillic look-alikes appear identical on screen, but are in fact different strings.
So, to reduce the entropy of the Universe, I wrote a small function that determines (if it can) the language in which a word is written and replaces the foreign look-alike characters with the proper ones from that alphabet. The normalized word is then checked against the list of stop words to decide whether it is prohibited or not :)

(Cyrillic is marked with red, Latin or numbers with blue)

I will not go into the details of the C# code; I think everything will be clear.
Constructive suggestions and comments are welcome :)

/// <summary>
/// Normalizes a word that mixes Cyrillic and Latin look-alike characters into a single
/// alphabet so that it can be compared against a stop-word list.
/// </summary>
/// <param name="word">The word to inspect.</param>
/// <param name="lang">
/// Receives "ru" when the word contains at least one unambiguously Cyrillic letter,
/// "en" when it contains at least one unambiguously Latin letter (and no unambiguously
/// Cyrillic one), or "?" when neither applies.
/// </param>
/// <param name="Changes">
/// Receives true when at least one character was substituted. The comparison is made
/// before lower-casing, so a pure case change does not count as a change.
/// </param>
/// <returns>The normalized word, lower-cased.</returns>
/// <remarks>
/// Also sets <c>_SelectionColor</c>: red for "ru", blue for "en", black otherwise.
/// </remarks>
public string CheckWord(string word, out string lang, out bool Changes)
{
    // NOTE(review): the Cyrillic literals below were lost when this listing was extracted
    // (OnlyRu was empty and Rus contained only "1@"), which made the "ru" branch unreachable
    // and made the "en" branch index past the end of Rus. They are reconstructed here from
    // the surviving Latin side of the table - confirm against the original stop-word setup.

    // Letters that exist only in one alphabet; a single hit decides the language.
    const string OnlyRu = "БбвГгДдЁёЖжИЙйЛлмнПФфЦцЧчШшЩщЪъЫыЭэЮюЯя";
    const string OnlyEn = "DdFfGghIiJjLlNQqRrSstUVvWwYZz";

    // Position-aligned look-alike tables: Rus[i] is the Cyrillic twin of Eng[i].
    // Every letter in Rus is Cyrillic; only '1' and '@' are ASCII.
    const string Rus = "АаВЕеКкМНОоРрСсТуХхЗО1тиа@пь";
    const string Eng = "AaBEeKkMHOoPpCcTyXx30imu@anb";

    // Never read past the shorter table, even if the two literals drift apart again.
    int pairCount = Rus.Length < Eng.Length ? Rus.Length : Eng.Length;

    lang = "?";
    string newword = word;

    if (word.IndexOfAny(OnlyRu.ToCharArray()) >= 0)
    {
        // Russian word: turn every Latin look-alike into its Cyrillic twin.
        lang = "ru";
        for (int i = 0; i < pairCount; i++)
        {
            newword = newword.Replace(Eng[i], Rus[i]);
        }
    }
    else if (word.IndexOfAny(OnlyEn.ToCharArray()) >= 0)
    {
        // English word: turn every Cyrillic look-alike into its Latin twin.
        // Replacements run in table order, so for a character listed twice in Rus
        // the first pairing wins.
        lang = "en";
        for (int i = 0; i < pairCount; i++)
        {
            newword = newword.Replace(Rus[i], Eng[i]);
        }
    }

    Changes = newword != word;

    // Culture-invariant lower-casing keeps the result stable for stop-word lookups
    // regardless of the current thread culture (e.g. Turkish dotted/dotless I).
    newword = newword.ToLowerInvariant();

    _SelectionColor = lang == "ru" ? Color.Red : lang == "en" ? Color.Blue : Color.Black;
    return newword;
}

* This source code was highlighted with Source Code Highlighter .
/// <summary>
/// Normalizes a word that mixes Cyrillic and Latin look-alike characters into a single
/// alphabet so that it can be compared against a stop-word list.
/// </summary>
/// <param name="word">The word to inspect.</param>
/// <param name="lang">
/// Receives "ru" when the word contains at least one unambiguously Cyrillic letter,
/// "en" when it contains at least one unambiguously Latin letter (and no unambiguously
/// Cyrillic one), or "?" when neither applies.
/// </param>
/// <param name="Changes">
/// Receives true when at least one character was substituted. The comparison is made
/// before lower-casing, so a pure case change does not count as a change.
/// </param>
/// <returns>The normalized word, lower-cased.</returns>
/// <remarks>
/// Also sets <c>_SelectionColor</c>: red for "ru", blue for "en", black otherwise.
/// </remarks>
public string CheckWord(string word, out string lang, out bool Changes)
{
    // NOTE(review): the Cyrillic literals below were lost when this listing was extracted
    // (OnlyRu was empty and Rus contained only "1@"), which made the "ru" branch unreachable
    // and made the "en" branch index past the end of Rus. They are reconstructed here from
    // the surviving Latin side of the table - confirm against the original stop-word setup.

    // Letters that exist only in one alphabet; a single hit decides the language.
    const string OnlyRu = "БбвГгДдЁёЖжИЙйЛлмнПФфЦцЧчШшЩщЪъЫыЭэЮюЯя";
    const string OnlyEn = "DdFfGghIiJjLlNQqRrSstUVvWwYZz";

    // Position-aligned look-alike tables: Rus[i] is the Cyrillic twin of Eng[i].
    // Every letter in Rus is Cyrillic; only '1' and '@' are ASCII.
    const string Rus = "АаВЕеКкМНОоРрСсТуХхЗО1тиа@пь";
    const string Eng = "AaBEeKkMHOoPpCcTyXx30imu@anb";

    // Never read past the shorter table, even if the two literals drift apart again.
    int pairCount = Rus.Length < Eng.Length ? Rus.Length : Eng.Length;

    lang = "?";
    string newword = word;

    if (word.IndexOfAny(OnlyRu.ToCharArray()) >= 0)
    {
        // Russian word: turn every Latin look-alike into its Cyrillic twin.
        lang = "ru";
        for (int i = 0; i < pairCount; i++)
        {
            newword = newword.Replace(Eng[i], Rus[i]);
        }
    }
    else if (word.IndexOfAny(OnlyEn.ToCharArray()) >= 0)
    {
        // English word: turn every Cyrillic look-alike into its Latin twin.
        // Replacements run in table order, so for a character listed twice in Rus
        // the first pairing wins.
        lang = "en";
        for (int i = 0; i < pairCount; i++)
        {
            newword = newword.Replace(Rus[i], Eng[i]);
        }
    }

    Changes = newword != word;

    // Culture-invariant lower-casing keeps the result stable for stop-word lookups
    // regardless of the current thread culture (e.g. Turkish dotted/dotless I).
    newword = newword.ToLowerInvariant();

    _SelectionColor = lang == "ru" ? Color.Red : lang == "en" ? Color.Blue : Color.Black;
    return newword;
}

* This source code was highlighted with Source Code Highlighter .
/// <summary>
/// Normalizes a word that mixes Cyrillic and Latin look-alike characters into a single
/// alphabet so that it can be compared against a stop-word list.
/// </summary>
/// <param name="word">The word to inspect.</param>
/// <param name="lang">
/// Receives "ru" when the word contains at least one unambiguously Cyrillic letter,
/// "en" when it contains at least one unambiguously Latin letter (and no unambiguously
/// Cyrillic one), or "?" when neither applies.
/// </param>
/// <param name="Changes">
/// Receives true when at least one character was substituted. The comparison is made
/// before lower-casing, so a pure case change does not count as a change.
/// </param>
/// <returns>The normalized word, lower-cased.</returns>
/// <remarks>
/// Also sets <c>_SelectionColor</c>: red for "ru", blue for "en", black otherwise.
/// </remarks>
public string CheckWord(string word, out string lang, out bool Changes)
{
    // NOTE(review): the Cyrillic literals below were lost when this listing was extracted
    // (OnlyRu was empty and Rus contained only "1@"), which made the "ru" branch unreachable
    // and made the "en" branch index past the end of Rus. They are reconstructed here from
    // the surviving Latin side of the table - confirm against the original stop-word setup.

    // Letters that exist only in one alphabet; a single hit decides the language.
    const string OnlyRu = "БбвГгДдЁёЖжИЙйЛлмнПФфЦцЧчШшЩщЪъЫыЭэЮюЯя";
    const string OnlyEn = "DdFfGghIiJjLlNQqRrSstUVvWwYZz";

    // Position-aligned look-alike tables: Rus[i] is the Cyrillic twin of Eng[i].
    // Every letter in Rus is Cyrillic; only '1' and '@' are ASCII.
    const string Rus = "АаВЕеКкМНОоРрСсТуХхЗО1тиа@пь";
    const string Eng = "AaBEeKkMHOoPpCcTyXx30imu@anb";

    // Never read past the shorter table, even if the two literals drift apart again.
    int pairCount = Rus.Length < Eng.Length ? Rus.Length : Eng.Length;

    lang = "?";
    string newword = word;

    if (word.IndexOfAny(OnlyRu.ToCharArray()) >= 0)
    {
        // Russian word: turn every Latin look-alike into its Cyrillic twin.
        lang = "ru";
        for (int i = 0; i < pairCount; i++)
        {
            newword = newword.Replace(Eng[i], Rus[i]);
        }
    }
    else if (word.IndexOfAny(OnlyEn.ToCharArray()) >= 0)
    {
        // English word: turn every Cyrillic look-alike into its Latin twin.
        // Replacements run in table order, so for a character listed twice in Rus
        // the first pairing wins.
        lang = "en";
        for (int i = 0; i < pairCount; i++)
        {
            newword = newword.Replace(Rus[i], Eng[i]);
        }
    }

    Changes = newword != word;

    // Culture-invariant lower-casing keeps the result stable for stop-word lookups
    // regardless of the current thread culture (e.g. Turkish dotted/dotless I).
    newword = newword.ToLowerInvariant();

    _SelectionColor = lang == "ru" ? Color.Red : lang == "en" ? Color.Blue : Color.Black;
    return newword;
}

* This source code was highlighted with Source Code Highlighter .



The call is simple:
// Usage example: split a phrase into words and normalize each one with CheckWord.
string lang;
bool changes;

// A "word" is a run of word characters plus '@', which CheckWord's look-alike tables also contain.
string re = "[\\w\\@]+";
Regex rx = new Regex(re, RegexOptions.IgnoreCase | RegexOptions.Singleline);

// Sample input: the Russian phrase for "selling concrete" with the letters p, o, a, e typed
// in Latin. The genuinely Cyrillic letters are written as \u escapes so the mix of alphabets
// stays visible and survives re-encoding.
// NOTE(review): the sample was reduced to its Latin residue "poae eo" in this copy of the
// listing; the Cyrillic letters are restored from context - confirm against the original.
foreach (Match m in rx.Matches("\u043Fpo\u0434ae\u043C \u0431e\u0442o\u043D"))
{
    // newWord, lang and changes are now ready for the stop-word lookup.
    string newWord = CheckWord(m.Value, out lang, out changes);
}

* This source code was highlighted with Source Code Highlighter .
// Usage example: split a phrase into words and normalize each one with CheckWord.
string lang;
bool changes;

// A "word" is a run of word characters plus '@', which CheckWord's look-alike tables also contain.
string re = "[\\w\\@]+";
Regex rx = new Regex(re, RegexOptions.IgnoreCase | RegexOptions.Singleline);

// Sample input: the Russian phrase for "selling concrete" with the letters p, o, a, e typed
// in Latin. The genuinely Cyrillic letters are written as \u escapes so the mix of alphabets
// stays visible and survives re-encoding.
// NOTE(review): the sample was reduced to its Latin residue "poae eo" in this copy of the
// listing; the Cyrillic letters are restored from context - confirm against the original.
foreach (Match m in rx.Matches("\u043Fpo\u0434ae\u043C \u0431e\u0442o\u043D"))
{
    // newWord, lang and changes are now ready for the stop-word lookup.
    string newWord = CheckWord(m.Value, out lang, out changes);
}

* This source code was highlighted with Source Code Highlighter .

')

Source: https://habr.com/ru/post/86303/


All Articles