#!/usr/bin/perl
#
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# $Id: readalign,v 1.6 2009/06/10 21:58:01 joerg72 Exp $
#
# usage: readalign [-m max] [-h] [-d dir] xces-file
#
#  -m max: show at most <max> sentence alignments
#  -h    : print html
#  -d dir: extra directory to check for the aligned documents (default: xml)
#


use strict;
use FindBin qw($Bin);

# Command-line switches, consumed from the front of @ARGV until the first
# argument that does not start with a dash:
#   -h      produce HTML instead of plain text
#   -m max  stop after <max> sentence alignments (0 means no limit)
#   -d dir  extra directory to check for from/toDoc
my ($html,$max)=(0,0);
my $dir='xml';
while (@ARGV and $ARGV[0]=~/^-/){
    my $opt=shift @ARGV;
    if    ($opt=~/^-h/){$html=1;}
    elsif ($opt=~/^-m/){$max=shift @ARGV;}
    elsif ($opt=~/^-d/){$dir=shift @ARGV;}
}

# Alignment (XCES) file named on the command line.
my $ALIGN=shift(@ARGV);
if (not defined $ALIGN){
    die "usage: readalign [-m max] [-h] [-d dir] xces-file\n";
}

# Paths of the source/target documents currently being read
# (filled in while the alignment file is scanned).
my $srcdoc='';
my $trgdoc='';

# Fall back to a gzipped copy if the plain file is not there.
if ((not -e $ALIGN) and (-e "$ALIGN.gz")){$ALIGN="$ALIGN.gz";}
if (not -e $ALIGN){die "Alignment file $ALIGN does not exist!\n";}

# Open the alignment file as F.  The file name is never passed through a
# shell (list-form pipe open / 3-arg open), so names containing blanks or
# shell metacharacters are handled literally.  Only a trailing ".gz" selects
# decompression, the same test that is applied to the aligned documents.
if ($ALIGN=~/\.gz$/){
    open(F,'-|','gzip','-cd','--',$ALIGN)
        or die "Cannot run gzip on $ALIGN: $!\n";
}
else{
    open(F,'<',$ALIGN)
        or die "Cannot open $ALIGN: $!\n";
}

# Emit the HTML preamble before any alignment output.
if ($html){PrintHtmlHeader();}

my $firstSrc=1;      # true until SRC has been opened for the first time
my $firstTrg=1;      # true until TRG has been opened for the first time
my $srcname='';      # fromDoc attribute value that $srcdoc was resolved from
my $trgname='';      # toDoc attribute value that $trgdoc was resolved from
my $srcok=0;         # true if the most recent open of SRC succeeded
my $trgok=0;         # true if the most recent open of TRG succeeded

# Scan the alignment file (handle F) line by line.  Three attributes are
# recognised: fromDoc / toDoc switch the source / target document, and
# xtargets="<src ids>;<trg ids>" prints one sentence alignment.
my $count=0;         # number of sentence alignments seen so far
while (<F>){

    # Source document: (re)open only when the attribute value changes.
    # The comparison uses the raw attribute, not the resolved path, so a
    # document found as "<name>.gz" or below $dir is not reopened each time.
    if (/fromDoc=\"([^\"]+)\"/){
        my $name=$1;
        if ($srcname ne $name){
            $srcname=$name;
            if (not $firstSrc){close SRC;}
            $srcdoc=ResolveDoc($name,$dir);
            $srcok=OpenDoc(\*SRC,$srcdoc);
            $firstSrc=0;
        }
    }

    # Target document: same handling, plus a header naming both documents.
    if (/toDoc=\"([^\"]+)\"/){
        my $name=$1;
        if ($trgname ne $name){
            $trgname=$name;
            if (not $firstTrg){close TRG;}
            $trgdoc=ResolveDoc($name,$dir);
            $trgok=OpenDoc(\*TRG,$trgdoc);
            $firstTrg=0;
            if ($html){print "<p>\n";}
            print "\n# ".$srcdoc;
            if ($html){print '<br>';}
            print "\n# ".$trgdoc."\n\n";
            if ($html){print "<p><hr>\n";}
            else{print "================================\n";}
        }
    }

    # One sentence alignment: blank-separated ids on either side of ';'.
    if (/xtargets=\"([^\"]*)\s*\;\s*([^\"]*)\"/){
        my ($src,$trg)=($1,$2);
        $count++;
        if ($max and ($count>$max)){last;}

        # split ' ' skips leading blanks and collapses runs of blanks, so no
        # empty ids are produced (an empty id can never be found and would
        # force a scan to the end of the document).
        my @srcsent=split(' ',$src);
        my @trgsent=split(' ',$trg);

        my $srceof=PrintSentences(\*SRC,'(src)',\@srcsent,$html);
        my $trgeof=PrintSentences(\*TRG,'(trg)',\@trgsent,$html);

        # A document that was read to its end is reopened so that the next
        # alignment searches from the top again.  Nothing is reopened for an
        # empty side (nothing was read) or for a document that could not be
        # opened in the first place.
        if ($trgeof and $trgok){
            close TRG;
            $trgok=OpenDoc(\*TRG,$trgdoc);
        }
        if ($srceof and $srcok){
            close SRC;
            $srcok=OpenDoc(\*SRC,$srcdoc);
        }

        if ($html){print "<hr>\n";}
        else{print "================================\n";}
    }
}

# Return the path under which document $name is found.  Candidates are
# tried in this order: $name, "$name.gz", "$extradir/$name",
# "$extradir/$name.gz".  If none exists, $name is returned unchanged.
sub ResolveDoc{
    my ($name,$extradir)=@_;
    foreach my $candidate ($name,
                           "$name.gz",
                           "$extradir/$name",
                           "$extradir/$name.gz"){
        if (-e $candidate){return $candidate;}
    }
    return $name;
}

# Open $file for reading on the filehandle (glob reference) $fh; files whose
# name ends in ".gz" are read through "gzip -cd".  The name is never handed
# to a shell.  A failure is reported on STDERR instead of being ignored.
# Returns 1 on success, 0 on failure.
sub OpenDoc{
    my ($fh,$file)=@_;
    my $ok=($file=~/\.gz$/)
        ? open($fh,'-|','gzip','-cd','--',$file)
        : open($fh,'<',$file);
    if (not $ok){
        warn "Cannot open $file: $!\n";
        return 0;
    }
    return 1;
}

# Print the sentences with the given ids, reading forward from the current
# position of filehandle $fh.
#   $fh     - filehandle (glob reference) of the document
#   $label  - text that replaces everything up to and including the "id"
#             attribute name of the <s> tag, e.g. '(src)'
#   $ids    - array reference with the sentence ids, in document order
#   $ashtml - true: pass text through Str2Html and append <br>;
#             false: turn &gt; &lt; &amp; back into plain characters
# Returns 1 if some id was not found (the document was read to its end),
# 0 otherwise.
sub PrintSentences{
    my ($fh,$label,$ids,$ashtml)=@_;

    # read one chunk per sentence end tag; restored automatically on return
    local $/='</s>';

    my $missed=0;
    foreach my $id (@{$ids}){
        my $found=0;
        while (my $sent=<$fh>){
            # \Q..\E: the id is literal text, not a regular expression
            next unless ($sent=~/s [^\>]*id="\Q$id\E"/s);
            $sent=~s/^.*<s [^\>]*id/$label/s;   # drop all before the id value
            $sent=~s/\n/ /gs;                   # one line per sentence
            $sent=~s/\<[^\>]*>//gs;             # strip remaining markup
            $sent=~s/  +/ /gs;                  # squeeze blanks
            if ($ashtml){$sent=Str2Html($sent);}
            else{
                # &amp; last, so "&amp;lt;" is not unescaped twice
                $sent=~s/\&gt\;/\>/gs;
                $sent=~s/\&lt\;/\</gs;
                $sent=~s/\&amp\;/\&/gs;
            }
            print $sent;
            if ($ashtml){print "<br>";}
            print "\n";
            $found=1;
            last;
        }
        if (not $found){$missed=1;}
    }
    return $missed;
}


# Close the document handles (only if they were ever opened) and the
# alignment file, then finish the page when HTML output was requested.
close SRC unless $firstSrc;
close TRG unless $firstTrg;
close F;

PrintHtmlTail() if $html;

# Prepare a sentence for HTML output.
# Currently the identity function: the argument is returned unchanged.
# Escaping of & < > used to be done here and is disabled.
# NOTE(review): presumably because the text still carries its XML entities
# when it gets here; confirm before enabling any escaping.
sub Str2Html{
    my ($text)=@_;
    return $text;
}

# Print the opening part of the HTML page (doctype, head with a UTF-8
# content-type declaration, opening body tag) to the selected output handle.
sub PrintHtmlHeader{
    my @lines=(
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"',
        '"http://www.w3.org/TR/REC-html40/loose.dtd"> ',   # trailing blank kept
        '<html>',
        '<head>',
        '<title>Untitled Document</title>',
        '<meta http-equiv="Content-Type" content="text/html;charset=utf-8">',
        '</head>',
        '<body>',
    );
    print join('',map { "$_\n" } @lines);
}


# Print the closing tags of the HTML page to the selected output handle.
sub PrintHtmlTail{
    print "</body>\n</html>\n";
}

