1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | use strict; |
---|
4 | use warnings; |
---|
5 | use utf8; |
---|
6 | #use bigint; |
---|
7 | use List::MoreUtils qw(uniq); |
---|
8 | use HTML::TreeBuilder; |
---|
9 | use Data::Dumper; |
---|
10 | use Getopt::Long; |
---|
11 | use UNIVERSAL 'isa'; |
---|
12 | |
---|
13 | my $output_dir = '../toolbox_html'; |
---|
14 | my $input_dir = '../html'; |
---|
15 | my $filename = 'annotated.html'; |
---|
16 | my @classes = @ARGV; |
---|
17 | ;;;print STDERR Dumper \@classes; |
---|
18 | |
---|
19 | GetOptions( |
---|
20 | 'filename=s' => \$filename, |
---|
21 | 'output-dir=s' => \$output_dir, |
---|
22 | 'input-dir=s' => \$input_dir, |
---|
23 | ) or die('Wrong options format'); |
---|
24 | |
---|
25 | my $out_filename = $output_dir . '/' . $filename; |
---|
26 | open (my $OUT, ">$out_filename") |
---|
27 | or die "cannot open $filename: $!"; |
---|
28 | $filename = $input_dir . '/' . $filename; |
---|
29 | |
---|
30 | print STDERR "Processing $filename\n"; |
---|
31 | |
---|
32 | my $html = HTML::TreeBuilder->new(); |
---|
33 | $html->parse_file($filename); |
---|
34 | my $body = $html->content_array_ref()->[1]; |
---|
35 | |
---|
36 | my $i = 0; |
---|
37 | my $div_content; |
---|
38 | while ($div_content = $body->content_array_ref()->[$i++]) { |
---|
39 | last if (defined $div_content->attr('class')) |
---|
40 | && ($div_content->attr('class') eq 'contents'); |
---|
41 | } |
---|
42 | |
---|
43 | $i = 0; |
---|
44 | my $table; |
---|
45 | while ($table = $div_content->content_array_ref()->[$i++]) { |
---|
46 | last if isa($table, "HTML::Element") && ( $table->tag() eq 'table' ); |
---|
47 | } |
---|
48 | |
---|
49 | $i = 0; |
---|
50 | while (my $node = $table->content_array_ref()->[$i++]) { |
---|
51 | #;;print_content($node); |
---|
52 | my $class = $node->content_array_ref()->[0] #td |
---|
53 | ->content_array_ref()->[0] #a |
---|
54 | ->content_array_ref()->[0]; #bdm::ARX |
---|
55 | ;;;print STDERR "$class\n"; |
---|
56 | |
---|
57 | ;;;map {print STDERR " $_\n"} grep {$_ eq $class} @classes; |
---|
58 | |
---|
59 | next if grep {$_ eq $class} @classes; |
---|
60 | |
---|
61 | ;;;print STDERR "Hey! I passed after the grep!\n"; |
---|
62 | |
---|
63 | $node->detach(); |
---|
64 | } |
---|
65 | |
---|
66 | #print the result |
---|
67 | print $OUT $html->as_HTML(); |
---|
68 | |
---|
69 | $html->delete; |
---|
70 | |
---|
71 | #============================================================================= |
---|
72 | |
---|
73 | |
---|
74 | # depth-first search |
---|
75 | sub dfs { |
---|
76 | my $node = shift or return; |
---|
77 | return if !ref $node; # do not process text nodes |
---|
78 | |
---|
79 | my $function = shift or return; |
---|
80 | my $data = shift || {}; |
---|
81 | my $depth = shift || 0; |
---|
82 | my $limit = shift || 10000; #inf; |
---|
83 | return if $depth > $limit; |
---|
84 | |
---|
85 | $function->($node, $data, $depth, $limit); |
---|
86 | |
---|
87 | for ($node->content_list()) { |
---|
88 | dfs($_, $function, $data, $depth+1, $limit); |
---|
89 | } |
---|
90 | } |
---|
91 | |
---|
92 | # print html subtree -- just for debugging |
---|
93 | sub print_content { |
---|
94 | my ($node, $depth, $limit) = @_; |
---|
95 | my $print_tag = sub { |
---|
96 | my ($node, undef, $depth) = @_; |
---|
97 | print STDERR " " x $depth, $node->tag(), "\n"; |
---|
98 | #;;print $node->as_HTML() if $node->tag() eq '~comment'; |
---|
99 | }; |
---|
100 | dfs($node, $print_tag, {}, $depth, $limit); |
---|
101 | } |
---|