<meta http-equiv="content-type" content="text/html; charset=utf-8" />
use strict;
use warnings;
use HTML::TokeParser;
use Data::Dumper;

# Dump every token HTML::TokeParser produces for the HTML file named as the
# first command-line argument, one Data::Dumper record per token.
my $path = $ARGV[0];
die "Usage: $0 FILE.html\n" unless defined $path;

# Fail loudly here rather than handing an unopened handle to the parser.
open(my $f, "<", $path) or die "Cannot open '$path': $!\n";

my $p = HTML::TokeParser->new($f);
while (my $token = $p->get_token()) {
    print Dumper($token);
}

close($f);
$VAR1 = [ 'T', ' ', '' ]; $VAR1 = [ 'D', '<!DOCTYPE html>' ]; $VAR1 = [ 'T', ' ', '' ]; $VAR1 = [ 'S', 'html', { 'xmlns' => 'http://www.w3.org/1999/xhtml', 'xml:lang' => 'ru' }, [ 'xmlns', 'xml:lang' ], '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ru">' ];
<a href="http://habrahabr.ru/post/163525/#habracut" class="button habracut"> →</a>
use strict;
use warnings;
use habr_parse;
use LWP::UserAgent;
use Data::Dumper;

# Fetch the Habr front page, hand the response body to habr_parse and dump
# whatever get_page_links() returns.
my $url = "http://habrahabr.ru";
my $ua  = LWP::UserAgent->new();
my $res = $ua->get($url);

# A failed request used to end the script without any output; say why.
die "GET $url failed: " . $res->status_line . "\n"
    unless $res->is_success();

my $parser = habr_parse->new();
my $conf   = {
    content => $res->content,    # response body exactly as received
    cp      => 'utf8',           # encoding name passed along to the parser
};

my $r = $parser->get_page_links($conf);
print Dumper($r);
package habr_parse;

use strict;
use warnings;
use HTML::TokeParser;
use HTML::Entities;
use Data::Dumper;
use Encode;

# Construct a parser object. It carries no state of its own.
sub new {
    my $class = shift;
    my $self  = {};
    return bless($self, $class);
}

# Collect the href of every <a> start tag whose class attribute matches
# "button habracut" (case-insensitive, leading whitespace allowed).
#
# $conf is a hash reference with the keys:
#   content - the HTML document as a string
#   cp      - encoding name used to decode content; decoding is skipped
#             when it is not defined
#
# Returns a reference to an array of href values in document order. A
# matching tag without an href attribute contributes undef.
sub get_page_links {
    my $self = shift;
    my $conf = shift;
    my @data;

    # Work on a private copy so the caller's hash is left untouched and a
    # second call with the same $conf does not decode the text twice.
    my $html = $conf->{content};
    $html = '' unless defined $html;
    $html = decode($conf->{cp}, $html) if defined $conf->{cp};

    # Entities are deliberately NOT expanded across the whole document
    # before parsing: doing so turns escaped markup such as "&lt;a ...&gt;"
    # into real tags. The attribute hash returned by the parser already has
    # entities in attribute values expanded.
    my $p = HTML::TokeParser->new(\$html);

    while (my $token = $p->get_token()) {
        # Only start tags carry an attribute hash in slot 2.
        next unless $token->[0] eq 'S' && $token->[1] eq 'a';

        my $attr = $token->[2];
        next unless defined $attr->{class};

        # we found our link
        if ($attr->{class} =~ /^\s*button\s+habracut$/i) {
            push @data, $attr->{href};
        }
    }

    return \@data;
}

1;
if ($token->[0] eq 'S' && $token->[1] eq 'a' && defined ($token->[2]->{class}) && $token->[2]->{class}=~/^\s*button\s+habracut$/i)
$VAR1 = [ 'S', 'a', { 'href' => 'http://habrahabr.ru/hub/photo/', 'title' => ' ', 'class' => 'hub ' }, [ 'href', 'class', 'title' ], '<a href="http://habrahabr.ru/hub/photo/" class="hub " title=" " >' ]; $VAR1 = [ 'T', '', '' ];
$VAR1 = [ 'T', '. ', '' ]; $VAR1 = [ 'E', 'a', '</a>' ];
package habr_parse;

use strict;
use warnings;
use HTML::TokeParser;
use HTML::Entities;
use Data::Dumper;
use Encode;

# Construct a parser object. It carries no state of its own.
sub new {
    my $class = shift;
    my $self  = {};
    return bless($self, $class);
}

# Build one record per <div class="hubs"> found in $conf->{content}.
#
# $conf is a hash reference; only its "content" key (the HTML document as a
# string) is read. The text is parsed as given: no charset or entity
# decoding is applied here.
#
# Returns a reference to an array of hash references, in document order.
# Each record may contain:
#   cats - array reference holding the text token that directly precedes
#          each </a> inside the hubs div (key absent when none was found)
#   href - href of an <a class="button habracut"> start tag seen after the
#          hubs div and before the next one (key absent when none was seen)
sub get_page_links {
    my $self = shift;
    my $conf = shift;
    my @data;

    my $p        = HTML::TokeParser->new(\$conf->{content});
    my $tmp_conf = {};

    while (my $token = $p->get_token()) {
        # Only start tags carry an attribute hash in slot 2.
        next unless $token->[0] eq 'S';

        my ($tag, $attr) = @{$token}[1, 2];
        next unless defined $attr->{class};

        if ($tag eq 'a' && $attr->{class} =~ /^\s*button\s+habracut$/i) {
            # we found our link; it belongs to the record opened by the
            # most recent hubs div, which is already stored in @data
            $tmp_conf->{href} = $attr->{href};
        }
        elsif ($tag eq 'div' && $attr->{class} eq 'hubs') {
            my @next;
            my $found = 0;

            # Every hubs div starts a fresh record.
            $tmp_conf = {};

            # Read tokens up to and including the next div tag (opening or
            # closing). Stopping on undef matters: at end of input
            # get_token() keeps returning undef, and the scan used to spin
            # forever on a document cut off inside the hubs div.
            while (defined(my $ahead = $p->get_token())) {
                push @next, $ahead;
                my ($kind, $name) = @{$ahead};

                # Compare the name only for tag tokens, so that a text or
                # comment token reading "div" does not end the scan early.
                last if ($kind eq 'S' || $kind eq 'E') && $name eq 'div';

                # A closing </a> directly preceded by a text token: that
                # text is the category name.
                if (   $kind eq 'E'
                    && $name eq 'a'
                    && @next >= 2
                    && $next[-2][0] eq 'T')
                {
                    push @{ $tmp_conf->{cats} }, $next[-2][1];
                    $found = 1;
                }
            }

            if (!$found) {
                # Nothing usable in there: hand the tokens back so the
                # outer loop examines them itself.
                $p->unget_token(@next);
            }

            push @data, $tmp_conf;
        }
    }

    return \@data;
}

1;
$VAR1 = [ { 'cats' => [ ' IT', ' ' ], 'href' => 'http://habrahabr.ru/post/162053/#habracut' }, { 'cats' => [ '', ' ' ], 'href' => 'http://habrahabr.ru/post/163433/#habracut' }, { 'cats' => [ ' ', '. ', ' ' ], 'href' => 'http://habrahabr.ru/post/163493/#habracut' }, { 'cats' => [ 'HTML', 'CSS' ], 'href' => 'http://habrahabr.ru/post/163429/#habracut' }, { 'cats' => [ '', ' Intel' ], 'href' => 'http://habrahabr.ru/company/intel/blog/162293/#habracut' }, { 'cats' => [ ' — ', '', ' ' ], 'href' => 'http://habrahabr.ru/company/tm/blog/163483/#habracut' }, { 'cats' => [ '-', 'Open source' ], 'href' => 'http://habrahabr.ru/post/163425/#habracut' }, { 'cats' => [ '', ' ', 'Open source' ], 'href' => 'http://habrahabr.ru/post/148911/#habracut' }, { 'cats' => [ '' ], 'href' => 'http://habrahabr.ru/post/163445/#habracut' }, { 'cats' => [ ' ', ' ' ], 'href' => 'http://habrahabr.ru/post/163525/#habracut' } ];
Source: https://habr.com/ru/post/163567/
All Articles