s a l e f o r s a l e f o r s a l e
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
s 0 + . . . . . . . . + . . . . . . . . + . . .
a 1 . + . . . . . . . . + . . . . . . . . + . .
l 2 . . + . . . . . . . . + . . . . . . . . + .
e 3 . . . + . . . . . . . . + . . . . . . . . +
4 . . . . + . . . + . . . . + . . . + . . . .
f 5 . . . . . + . . . . . . . . + . . . . . . .
o 6 . . . . . . + . . . . . . . . + . . . . . .
r 7 . . . . . . . + . . . . . . . . + . . . . .
8 . . . . . . . . + . . . . + . . . + . . . .
s 9 . . . . . . . . . + . . . . . . . . + . . .
a 10 . . . . . . . . . . + . . . . . . . . + . .
l 11 . . . . . . . . . . . + . . . . . . . . + .
e 12 . . . . . . . . . . . . + . . . . . . . . +
13 . . . . . . . . . . . . . + . . . + . . . .
f 14 . . . . . . . . . . . . . . + . . . . . . .
o 15 . . . . . . . . . . . . . . . + . . . . . .
r 16 . . . . . . . . . . . . . . . . + . . . . .
17 . . . . . . . . . . . . . . . . . + . . . .
s 18 . . . . . . . . . . . . . . . . . . + . . .
a 19 . . . . . . . . . . . . . . . . . . . + . .
l 20 . . . . . . . . . . . . . . . . . . . . + .
e 21 . . . . . . . . . . . . . . . . . . . . . +
$VAR1 = {
'sale' => 3,
'for sale' => 2
};
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use Data::Dumper;

binmode(STDOUT, ':utf8');

# Default minimum length (in characters) a repeated fragment must have
# to be reported.
my $min_longest_repeat_length = 4;

my $message = 'sale for sale for sale';
my %longest_repeates = ();

get_longest_repeates(\$message, \%longest_repeates);
print Dumper(\%longest_repeates);

# get_longest_repeates(\$text, \%result [, $min_length])
#
# Finds fragments of $text that occur more than once and stores, for each
# fragment, an occurrence count in %result (fragment => count).
#
# Parameters:
#   \$text       - reference to the string to analyse.
#   \%result     - reference to the hash that receives the counts; existing
#                  keys are kept and overwritten only for fragments found.
#   $min_length  - optional; fragments shorter than this are ignored.
#                  Defaults to $min_longest_repeat_length.
#
# Method: a character-by-character match matrix is built (upper triangle
# only), then every diagonal above the main one is scanned. A run of
# consecutive matches on a diagonal at offset N means the same text occurs
# twice, N characters apart. One leading space is stripped from each run.
# The count reported for a fragment is 1 (the first occurrence) plus the
# number of distinct start positions recorded for its later occurrences.
#
# Side effect: prints the match matrix to STDOUT.
sub get_longest_repeates {
    my ($text_ref, $reps_ref, $min_length) = @_;
    $min_length = $min_longest_repeat_length unless defined $min_length;

    my @symbols = split //, $$text_ref;
    my $m_len   = scalar @symbols;

    # $matrix[$row][$col] is 1 when the characters at positions $row and
    # $col are equal; only cells with $col >= $row are filled in.
    my @matrix = ();
    for my $row (0 .. $m_len - 1) {
        $matrix[$row] = [];
        for my $col ($row .. $m_len - 1) {
            $matrix[$row][$col] = 1 if $symbols[$row] eq $symbols[$col];
        }
    }

    # fragment => { start position of a later occurrence => 1 }
    my %positions_of = ();

    # Walk the diagonals from the largest offset down to 1, so that shorter
    # fragments discovered on far diagonals are already known when a longer
    # run containing them is scanned.
    for my $offset (reverse 1 .. $m_len - 1) {
        my $run = '';

        for my $col ($offset .. $m_len - 1) {
            if ($matrix[$col - $offset][$col]) {
                (my $trimmed = $run) =~ s/^ //;

                if (defined $positions_of{$trimmed}) {
                    # The run collected so far is an already known fragment:
                    # record this occurrence and start a new run here.
                    $positions_of{$trimmed}{ $col - length($trimmed) } = 1;
                    $run = $symbols[$col];
                }
                else {
                    $run .= $symbols[$col];
                }
            }
            else {
                # Mismatch ends the current run (which stops just before $col).
                _record_repeat(\%positions_of, $run, $col, $min_length);
                $run = '';
            }
        }

        # A run may extend to the very end of the text.
        _record_repeat(\%positions_of, $run, $m_len, $min_length);
    }

    for my $fragment (keys %positions_of) {
        $reps_ref->{$fragment} = 1 + scalar keys %{ $positions_of{$fragment} };
    }

    _print_match_matrix(\@symbols, \@matrix);

    return;
}

# Stores a finished run in %$positions_of if, after stripping one leading
# space, it is non-empty and at least $min_length characters long.
# $end is the index just past the last character of the run.
sub _record_repeat {
    my ($positions_of, $run, $end, $min_length) = @_;

    return if $run eq '';

    (my $fragment = $run) =~ s/^ //;
    return if $fragment eq '' || length($fragment) < $min_length;

    $positions_of->{$fragment}{ $end - length($fragment) } = 1;
    return;
}

# Prints the match matrix to STDOUT: a header with the characters, a header
# with their indices, then one row per character with '+' for a match and
# '.' otherwise. Every column is three characters wide so headers and cells
# line up.
sub _print_match_matrix {
    my ($symbols, $matrix) = @_;

    my $last      = $#{$symbols};
    my $label_pad = ' ' x 5;    # width of a row label: symbol + '%3d '

    print "\n";
    print $label_pad;
    printf '%3s', $symbols->[$_] for 0 .. $last;
    print "\n";

    print $label_pad;
    printf '%3d', $_ for 0 .. $last;
    print "\n";
    print "\n";

    for my $row (0 .. $last) {
        print $symbols->[$row];
        printf '%3d ', $row;
        for my $col (0 .. $last) {
            printf '%3s', ($matrix->[$row][$col] ? '+' : '.');
        }
        print "\n";
    }
    print "\n";

    return;
}
Number of repetitions | In spam,% | Not spam,% |
---|---|---|
2 | 78.58 | 90.28 |
3 | 11.93 | 4.86 |
4 | 4.45 | 2.08 |
5 | 2.30 | 1.39 |
6 | 1.93 | 0 |
7 | 0.22 | 0 |
8 | 0.37 | 0 |
9 | 0.07 | 0 |
Source: https://habr.com/ru/post/301302/
All Articles