#!/usr/bin/perl -w $VERSION = '0.4'; #------------------------------------------------------------------------------ # # Pod # #------------------------------------------------------------------------------ =head1 NAME html2xml.pl - script for generating formatted XML from HTML =head1 SYNOPSIS html2xml.pl cat | html2xml.pl =head1 DESCRIPTION This script cleans HTML documents so that the data they contain can be stored in a native XML database. The generated XML elements are:
can be the BODY element or a DIV element. Like everything, this script is not perfect, so I will be pleased if you mail me any bugs you find. Ce script est fait pour extraire les données "utiles" d'un document HTML et les sauvegarder dans un document XML dont les éléments sont :
comporte l'élément BODY ou les DIV du document HTML Ce script n'est pas parfait et il est donc fort possible que vous en repériez un disfonctionnement. Je serai ravi que vous m'en faisiez part dans un courriel. =head1 PREREQUISITES HTML::TreeBuilder Encode (included in Perl 5.8) =head1 OSNAMES any =head1 AUTHOR Francois Colombier Efrancois.colombier@free.fr =head1 COPYRIGHT This script is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SCRIPT CATEGORIES Web =cut #------------------------------------------------------------------------------ # # End of pod # #------------------------------------------------------------------------------ use strict; require 5.004; use HTML::TreeBuilder; use Encode; #------------------------------------------------------------------------------ # # Public global varables # #------------------------------------------------------------------------------ use vars qw( $html_tree %equiv ); #------------------------------------------------------------------------------ # # set autoflushing # #------------------------------------------------------------------------------ $|++; #------------------------------------------------------------------------------ # # BEGIN block - create global objects # #------------------------------------------------------------------------------ BEGIN { $html_tree = new HTML::TreeBuilder; %equiv = ('&'=>"&", '<'=>"<", '>'=>">" ); } #------------------------------------------------------------------------------ # # get_divs - routine for generating an array of divs from a given node # #------------------------------------------------------------------------------ sub get_divs { my $this = shift; # array to save divs in my @divs = (); # iterate though my children ... foreach my $node_ref ($this->content_refs_list) { if(ref $$node_ref) { my $tag = $$node_ref->tag; if ( $tag =~ /div/i ) { my @contenu=get_divs( $$node_ref ); push @divs,"
"; if(@contenu) { push @divs,@contenu; push @divs,"
"; } else { pop @divs; } } elsif ($tag =~ /^table$/i ) { push @divs,"
"; push @divs,get_tables($$node_ref ); push @divs,"
"; } elsif ($tag =~ /^(ol|ul|dl)$/i ) { push @divs,""; push @divs,get_paragraphs($$node_ref ); push @divs,""; } elsif ($tag =~ /^p$/i ) { my @contenu=get_paragraphs($$node_ref ); push @divs,"

"; if(@contenu) { if(@contenu != 1 || (@contenu == 1 && $contenu[1] !~ //)) { push @divs,@contenu; push @divs,"

"; } else { pop @divs; } } else { pop @divs; } } elsif($tag !~ /^script$/i) { push @divs,get_divs($$node_ref ) unless $tag =~ /