[1007] | 1 | #!/usr/bin/perl |
---|
| 2 | |
---|
| 3 | use strict; |
---|
| 4 | use warnings; |
---|
| 5 | use utf8; |
---|
| 6 | #use bigint; |
---|
| 7 | use List::MoreUtils qw(uniq); |
---|
| 8 | use HTML::TreeBuilder; |
---|
[1109] | 9 | use HTML::Entities; |
---|
[1007] | 10 | use Data::Dumper; |
---|
| 11 | use Getopt::Long; |
---|
| 12 | use UNIVERSAL 'isa'; |
---|
| 13 | |
---|
[1043] | 14 | my $output_dir = '../html/bdm_doc'; |
---|
| 15 | my $input_dir = '../../../../library/doc/html/'; |
---|
[1007] | 16 | my $filename = 'annotated.html'; |
---|
[1109] | 17 | #my @classes = map {encode_entities($_, '<>&')} @ARGV; |
---|
[1007] | 18 | my @classes = @ARGV; |
---|
| 19 | ;;;print STDERR Dumper \@classes; |
---|
| 20 | |
---|
| 21 | GetOptions( |
---|
| 22 | 'filename=s' => \$filename, |
---|
| 23 | 'output-dir=s' => \$output_dir, |
---|
| 24 | 'input-dir=s' => \$input_dir, |
---|
| 25 | ) or die('Wrong options format'); |
---|
| 26 | |
---|
| 27 | my $out_filename = $output_dir . '/' . $filename; |
---|
| 28 | open (my $OUT, ">$out_filename") |
---|
| 29 | or die "cannot open $filename: $!"; |
---|
| 30 | $filename = $input_dir . '/' . $filename; |
---|
| 31 | |
---|
| 32 | print STDERR "Processing $filename\n"; |
---|
| 33 | |
---|
| 34 | my $html = HTML::TreeBuilder->new(); |
---|
[1039] | 35 | $html->store_comments(1); |
---|
[1007] | 36 | $html->parse_file($filename); |
---|
| 37 | my $body = $html->content_array_ref()->[1]; |
---|
| 38 | |
---|
| 39 | my $i = 0; |
---|
| 40 | my $div_content; |
---|
| 41 | while ($div_content = $body->content_array_ref()->[$i++]) { |
---|
| 42 | last if (defined $div_content->attr('class')) |
---|
| 43 | && ($div_content->attr('class') eq 'contents'); |
---|
| 44 | } |
---|
| 45 | |
---|
| 46 | $i = 0; |
---|
| 47 | my $table; |
---|
| 48 | while ($table = $div_content->content_array_ref()->[$i++]) { |
---|
| 49 | last if isa($table, "HTML::Element") && ( $table->tag() eq 'table' ); |
---|
| 50 | } |
---|
| 51 | |
---|
| 52 | $i = 0; |
---|
[1039] | 53 | my @to_delete; |
---|
[1007] | 54 | while (my $node = $table->content_array_ref()->[$i++]) { |
---|
| 55 | #;;print_content($node); |
---|
| 56 | my $class = $node->content_array_ref()->[0] #td |
---|
| 57 | ->content_array_ref()->[0] #a |
---|
| 58 | ->content_array_ref()->[0]; #bdm::ARX |
---|
| 59 | |
---|
[1039] | 60 | #;;map {print STDERR " $_\n"} grep {$_ eq $class} @classes; |
---|
| 61 | #;;print STDERR "$class\n"; |
---|
[1007] | 62 | |
---|
[1109] | 63 | $class =~ s/ /_/g; |
---|
| 64 | |
---|
[1039] | 65 | if (grep {$_ eq $class} @classes) { |
---|
[1109] | 66 | #;;print "$class is in @classes\n\n"; |
---|
[1039] | 67 | next; |
---|
| 68 | }; |
---|
[1007] | 69 | |
---|
[1109] | 70 | ;;;print STDERR "$class will be detached\n\n"; |
---|
[1039] | 71 | #;;print STDERR $node->tag() . ' ' . $node->{_parent}->tag() . "\n"; |
---|
[1007] | 72 | |
---|
[1039] | 73 | push @to_delete, $node; |
---|
[1007] | 74 | } |
---|
| 75 | |
---|
[1039] | 76 | $_->delete() for @to_delete; |
---|
| 77 | |
---|
| 78 | #;;print_content($table); |
---|
| 79 | |
---|
[1007] | 80 | #print the result |
---|
| 81 | print $OUT $html->as_HTML(); |
---|
| 82 | |
---|
| 83 | $html->delete; |
---|
| 84 | |
---|
| 85 | #============================================================================= |
---|
| 86 | |
---|
| 87 | |
---|
| 88 | # depth-first search |
---|
| 89 | sub dfs { |
---|
| 90 | my $node = shift or return; |
---|
| 91 | return if !ref $node; # do not process text nodes |
---|
| 92 | |
---|
| 93 | my $function = shift or return; |
---|
| 94 | my $data = shift || {}; |
---|
| 95 | my $depth = shift || 0; |
---|
| 96 | my $limit = shift || 10000; #inf; |
---|
| 97 | return if $depth > $limit; |
---|
| 98 | |
---|
| 99 | $function->($node, $data, $depth, $limit); |
---|
| 100 | |
---|
| 101 | for ($node->content_list()) { |
---|
| 102 | dfs($_, $function, $data, $depth+1, $limit); |
---|
| 103 | } |
---|
| 104 | } |
---|
| 105 | |
---|
| 106 | # print html subtree -- just for debugging |
---|
| 107 | sub print_content { |
---|
| 108 | my ($node, $depth, $limit) = @_; |
---|
| 109 | my $print_tag = sub { |
---|
| 110 | my ($node, undef, $depth) = @_; |
---|
| 111 | print STDERR " " x $depth, $node->tag(), "\n"; |
---|
| 112 | #;;print $node->as_HTML() if $node->tag() eq '~comment'; |
---|
| 113 | }; |
---|
| 114 | dfs($node, $print_tag, {}, $depth, $limit); |
---|
| 115 | } |
---|