#!/usr/bin/perl use strict; use warnings; use utf8; #use bigint; use List::MoreUtils qw(uniq); use HTML::TreeBuilder; use Data::Dumper; use Getopt::Long; use UNIVERSAL 'isa'; my $output_dir = '../html/bdm_doc'; my $input_dir = '../../../../library/doc/html/'; my $filename = 'annotated.html'; my @classes = @ARGV; ;;;print STDERR Dumper \@classes; GetOptions( 'filename=s' => \$filename, 'output-dir=s' => \$output_dir, 'input-dir=s' => \$input_dir, ) or die('Wrong options format'); my $out_filename = $output_dir . '/' . $filename; open (my $OUT, ">$out_filename") or die "cannot open $filename: $!"; $filename = $input_dir . '/' . $filename; print STDERR "Processing $filename\n"; my $html = HTML::TreeBuilder->new(); $html->store_comments(1); $html->parse_file($filename); my $body = $html->content_array_ref()->[1]; my $i = 0; my $div_content; while ($div_content = $body->content_array_ref()->[$i++]) { last if (defined $div_content->attr('class')) && ($div_content->attr('class') eq 'contents'); } $i = 0; my $table; while ($table = $div_content->content_array_ref()->[$i++]) { last if isa($table, "HTML::Element") && ( $table->tag() eq 'table' ); } $i = 0; my @to_delete; while (my $node = $table->content_array_ref()->[$i++]) { #;;print_content($node); my $class = $node->content_array_ref()->[0] #td ->content_array_ref()->[0] #a ->content_array_ref()->[0]; #bdm::ARX #;;map {print STDERR " $_\n"} grep {$_ eq $class} @classes; #;;print STDERR "$class\n"; if (grep {$_ eq $class} @classes) { ;;;print "$class is in @classes\n"; next; }; #;;print STDERR "$class will be detached\n"; #;;print STDERR $node->tag() . ' ' . $node->{_parent}->tag() . "\n"; push @to_delete, $node; } $_->delete() for @to_delete; #;;print_content($table); #print the result print $OUT $html->as_HTML(); $html->delete; #============================================================================= # depth-first search sub dfs { my $node = shift or return; return if !ref $node; # do not process text nodes my $function = shift or return; my $data = shift || {}; my $depth = shift || 0; my $limit = shift || 10000; #inf; return if $depth > $limit; $function->($node, $data, $depth, $limit); for ($node->content_list()) { dfs($_, $function, $data, $depth+1, $limit); } } # print html subtree -- just for debugging sub print_content { my ($node, $depth, $limit) = @_; my $print_tag = sub { my ($node, undef, $depth) = @_; print STDERR " " x $depth, $node->tag(), "\n"; #;;print $node->as_HTML() if $node->tag() eq '~comment'; }; dfs($node, $print_tag, {}, $depth, $limit); }