// Sample text converted from UTF-8 to CP1251 (Windows-1251).
// NOTE(review): the literal below appears to have lost its original non-ASCII
// characters during extraction; it is reproduced exactly as found.
$string = iconv('UTF-8', 'Windows-1251', ' , , , .');

// Candidate encoding lists handed to mb_detect_encoding(), in call order.
$candidate_lists = array(
    array('UTF-8'),
    array('UTF-8', 'Windows-1251'),
    array('UTF-8', 'KOI8-R'),
    array('UTF-8', 'Windows-1251', 'KOI8-R'),
    array('UTF-8', 'ISO-8859-5'),
    array('UTF-8', 'Windows-1251', 'KOI8-R', 'ISO-8859-5'),
);

// First pass: the third argument is omitted, so mb_detect_encoding() uses its
// default (labelled "$strict = FALSE" in the original listing).
// Results noted in the original listing, in list order:
//   UTF-8, Windows-1251, KOI8-R, FALSE, ISO-8859-5, ISO-8859-5
foreach ($candidate_lists as $candidates) {
    var_dump(mb_detect_encoding($string, $candidates));
}

// Second pass: $strict = TRUE.
// Results noted in the original listing, in list order:
//   FALSE, FALSE, FALSE, FALSE, ISO-8859-5, ISO-8859-5
foreach ($candidate_lists as $candidates) {
    var_dump(mb_detect_encoding($string, $candidates, TRUE));
}
// ext/mbstring/mbstring.c:2629 PHP_FUNCTION(mb_detect_encoding) { ... // 2703 ret = mbfl_identify_encoding_name(&string, elist, size, strict); ...
// ext/mbstring/libmbfl/mbfl/mbfilter.c:643 const char* mbfl_identify_encoding_name(mbfl_string *string, enum mbfl_no_encoding *elist, int elistsz, int strict) { const mbfl_encoding *encoding; encoding = mbfl_identify_encoding(string, elist, elistsz, strict); ...
// ext/mbstring/libmbfl/mbfl/mbfilter.c:557 /* * identify encoding */ const mbfl_encoding * mbfl_identify_encoding(mbfl_string *string, enum mbfl_no_encoding *elist, int elistsz, int strict) { ...
// ext/mbstring/libmbfl/mbfl/mbfilter.c:593 (*filter->filter_function)(*p, filter); if (filter->flag) { bad++; }
// ext/mbstring/libmbfl/filters/mbfilter_cp1251.c:142 /* all of this is so ugly now! */ static int mbfl_filt_ident_cp1251(int c, mbfl_identify_filter *filter) { if (c >= 0x80 && c < 0xff) filter->flag = 0; else filter->flag = 1; /* not it */ return c; }
// ext/mbstring/libmbfl/filters/mbfilter_koi8r.c:142 static int mbfl_filt_ident_koi8r(int c, mbfl_identify_filter *filter) { if (c >= 0x80 && c < 0xff) filter->flag = 0; else filter->flag = 1; /* not it */ return c; }
// ext/mbstring/libmbfl/mbfl/mbfl_ident.c:248 int mbfl_filt_ident_true(int c, mbfl_identify_filter *filter) { return c; }
array ( '' => 0.095249209893009, '' => 0.06836817536026, '' => 0.067481298384992, '' => 0.055995027400041, '' => 0.052242744063325, .... '' => 0.002252892226507, '' => 0.0021318391371162, '' => 0.0018574762967903, '' => 0.0015961610948418, '' => 0.0014044332975731, '' => 0.0013188987793209, '' => 0.0012623590130186, '' => 0.0011804488387602, '' => 0.001061932790165, )
array ( '' => 0.095249209893009, '' => 0.095249209893009, '' => 0.06836817536026, '' => 0.06836817536026, '' => 0.067481298384992, '' => 0.067481298384992, '' => 0.055995027400041, '' => 0.055995027400041, .... '' => 0.0029893589260344, '' => 0.0029893589260344, '' => 0.0024649163501406, '' => 0.0024649163501406, '' => 0.002252892226507, '' => 0.002252892226507, '' => 0.0015961610948418, '' => 0.0015961610948418, )
// Character-frequency tables ("specters"), one per candidate encoding.
// Each included file must return an array; the loop below looks values up by
// byte code (ord()), so the tables are presumably keyed by byte value —
// TODO confirm against the specter_*.php files.
$encodings = array(
    'cp1251'   => require 'specter_cp1251.php',
    'koi8r'    => require 'specter_koi8r.php',
    'iso88595' => require 'specter_iso88595.php',
);

// Start every score at zero so that "+=" never reads an undefined index.
$enc_rates = array_fill_keys(array_keys($encodings), 0.0);

// $str is the byte string under test; it is expected to be defined before
// this point. PHP has no len() function, so the byte length comes from strlen().
$length = strlen($str);
for ($i = 0; $i < $length; ++$i) {
    $byte = ord($str[$i]);
    foreach ($encodings as $encoding => $char_specter) {
        // Bytes absent from a table contribute nothing to that encoding.
        if (isset($char_specter[$byte])) {
            $enc_rates[$encoding] += $char_specter[$byte];
        }
    }
}

// Dump the accumulated score for each candidate encoding.
var_dump($enc_rates);
cp1251 | koi8r | iso88595 |
0.441 | 0.020 | 0.085 | Windows-1251
0.049 | 0.441 | 0.166 | KOI8-R
0.133 | 0.092 | 0.441 | ISO-8859-5
cp1251 | koi8r | iso88595 |
0.013 | 0.705 | 0.331 | Windows-1251
0.649 | 0.013 | 0.201 | KOI8-R
0.007 | 0.392 | 0.013 | ISO-8859-5
cp1251 | koi8r | iso88595 |
0.477 | 0.342 | 0.085 | Windows-1251
0.315 | 0.477 | 0.207 | KOI8-R
0.216 | 0.321 | 0.477 | ISO-8859-5
cp1251 | koi8r | iso88595 |
1.074 | 0.705 | 0.465 | Windows-1251
0.649 | 1.074 | 0.201 | KOI8-R
0.331 | 0.392 | 1.074 | ISO-8859-5
// Sample text converted from UTF-8 to Windows-1251.
// NOTE(review): the literal appears to have lost its original non-ASCII
// characters during extraction; it is reproduced exactly as found.
$str_cp1251 = iconv('UTF-8', 'Windows-1251', ' ');

// Digest of the converted bytes, used as the reference value.
var_dump(md5($str_cp1251));

// Pass the same bytes through iconv() with identical source and target
// charsets and print the digest of each result.
foreach (array('Windows-1251', 'KOI8-R', 'ISO-8859-5', 'UTF-8') as $charset) {
    var_dump(md5(iconv($charset, $charset, $str_cp1251)));
}
m00t@m00t:~/workspace/test$ php detect_encoding.php string(32) "96e14d7add82668414ffbc498fcf2a4e" string(32) "96e14d7add82668414ffbc498fcf2a4e" string(32) "96e14d7add82668414ffbc498fcf2a4e" string(32) "96e14d7add82668414ffbc498fcf2a4e" PHP Notice: iconv(): Detected an illegal character in input string in /home/m00t/workspace/test/detect_encoding.php on line 36 PHP Stack trace: PHP 1. {main}() /home/m00t/workspace/test/detect_encoding.php:0 PHP 2. iconv() /home/m00t/workspace/test/detect_encoding.php:36 string(32) "d41d8cd98f00b204e9800998ecf8427e"
Source: https://habr.com/ru/post/107945/
All Articles