#!/usr/perl

# we need Perl 5.8.0+.
use 5.008;

use utf8;
use strict;

# make sure everything is output as UTF-8
binmode(STDOUT, ":utf8");

# program to generate a web page with Bidi IRI test examples
# Copyright W3C 2002 (Martin J. Duerst)

my $debug = '';

# general data definitions
# (name of variable is bidi character type as in TR9, value is
#  unique single lower-case letter)
# strong
my $L  = 'l';
my $LRE= 'v';
my $LRO= 'x';
my $R  = 'r';
my $AL = 'a';
my $RLE= 'y';
my $RLO= 'z';
# weak
my $PDF= 'p';
my $EN = 'e';
my $ES = 'd';
my $ET = 't';
my $AN = 'q';
my $CS = 'c';
my $NSM= 'm';
my $BN = 'u';
# neutral
my $B  = 'b';
my $S  = 's';
my $WS = 'w';
my $ON = 'o';
my $Strong = "$L$R$AL$LRE$LRO$RLE$RLO";
my $Weak   = "$EN$ES$ET$AN$CS$PDF$NSM$BN";
my $Neutral= "$B$S$WS$ON";


my $hebrew = "אבגדהוזחטיכלמןסעפץקרשת";
my $upper  = "ABCDEFGHIJKLMNOPQRSTUV";
my $arabic = "ابتثجحخدذرزسشصضطظعغفقك";

#
# logical to visual conversion
# (simplified version of the Unicode Bidi algorithm:
#  version 13 of TR 9, without embeddings/overrides,
#  only for a limited number of types, and only for
#  base direction LTR)
sub log2vis {
    my $input = shift; # input string
    my $type = shift;  # 'exact' or 'simulated'
    
    # use $_ as array of category letters, one letter for each character
    $_ = $input;
    # assign categories based on letters
    s/[a-z]/$L/g; # do this conversion first, to avoid
    	          # conflicts between actual letters and type
    if ($type eq 'exact') {
    	s/[A-Z]/$L/g;
    } elsif ($type eq 'hebrew') {
    	s/[A-Z]/$R/g;    
   	} elsif ($type eq 'arabic') {
   	    s/[A-Z]/$AL/g;
    } else {
    	die "Unknown second parameter to log2vis().\n";
    }
    s/[0-9]/$EN/g;
    s/[-+]/$ES/g;
    s/[#\$%]/$ET/g;
    s/[,.\/:]/$CS/g;
    s/[ ]/$WS/g;
    s/[!"&'()*;<=>?@\[\\\]^_`{|}~]/$ON/g;
    s/[\x{5D0}-\x{5EA}]/$R/g;   # Hebrew;
    s/[\x{621}-\x{64a}]/$AL/g;  # Arabic    
    print 'Start:  ', $_, "\n" if ($debug);
    
    # Apply bidi algorithm
    # (P1-P3: We have only one paragraph, and base directionality is LTR)

    # (X1-X9: We don't deal with embeddings/overrides)
    
    # X10: add sor/eor (both are L)
    s/^(.*)$/$L$1$L/;
    print 'X10:   ', $_, "\n" if ($debug);

    # W1: we don't deal with non-spacing marks yet, but anyway
    'continue' while s/(.)$NSM/$1$1/g;
    print 'W1:    ', $_, "\n" if ($debug);
    
    # W2: Search backwards from each instance of a European number until
    #     the first strong type (R, L, AL, or sor) is found. If an AL is found,
    #     change the type of the European number to Arabic number.
    'continue' while s/$AL([$Weak$Neutral]*)$EN/$AL$1$AN/g;
    print 'W2:    ', $_, "\n" if ($debug);

    # W3: change all ALs to R
    s/$AL/$R/g;
    print 'W3:    ', $_, "\n" if ($debug);

    # W4: A single European separator between two European numbers
    #     changes to a European number.
    'continue' while s/$EN$ES$EN/$EN$EN$EN/g;
    #     A single common separator between two numbers of the same type
    #     changes to that type
    'continue' while s/([$EN$AN])$CS\1/$1$1$1/g;
    print 'W4:    ', $_, "\n" if ($debug);

    # W5: ETs adjacent to ENs change to ENs
    'continue' while s/$ET$EN|$EN$ET/$EN$EN/g;
    print 'W5:    ', $_, "\n" if ($debug);

    # W6: separators and terminators change to ON
    s/[$ES$ET$CS]/$ON/g;
    print 'W6:    ', $_, "\n" if ($debug);

    # W7: Search backwards from each instance of a European number until
    #     the first strong type (R, L, or sor) is found. If an L is found,
    #     then change the type of the European number to L
        'continue' while s/$L([^$R]*)$EN/$L$1$L/g;
    print 'W7:    ', $_, "\n" if ($debug);

    # N1: A sequence of neutrals takes the direction of the surrounding
    #     strong text if the text on both sides has the same direction.
    #     European and Arabic numbers act as if they were R in terms of
    #     their influence on neutrals. Start-of-level-run (sor) and
    #     end-of-level-run (eor) are used at level run boundaries.
    
    
    ###neutrals take the direction of the surrounding strong text
    #     (EN and AN are treated as if they were R)
    'continue' while s/$L([$Neutral]*)[$Neutral]$L/$L$1$L$L/g;
    'continue' while s/([$R$AN$EN])([$Neutral]*)[$Neutral]([$R$AN$EN])/$1$2$R$3/g;
    print 'N1:    ', $_, "\n" if ($debug);

    # N2: Any remaining neutrals take the embedding direction (LTR)
    s/[$Neutral]/$L/g;
    print 'N2:    ', $_, "\n" if ($debug);
    
    # I1: for LTR base, R is 1, AN/EN is 2, rest is 0
    s/[$AN$EN]/2/g;
    s/$R/1/g;
    s/[a-z]/0/g;
    print 'I1:    ', $_, "\n" if ($debug);
    # (I2: we don't have odd embedding levels)
    
    # remove sor/eor
    s/^.(.*).$/$1/;

    my @input = split //, $input;
	# mirror
	my @levels = split //;
	my $i = 0;
	for my $c (@input) {
		if ($levels[$i++] % 2) {
		    $c =~ tr'()[]{}<>')(][}{><';
		}
	}
	
    # revert, starting with higher levels
    for my $level (reverse 1..2) {
        while (/($level*)/g) {
            my $p = length $`; # pre-match length
            my $m = length $&; # match length
        	splice @input, $p, $m, reverse @input[$p..($p+$m-1)];    
        }
        my $nextlevel = $level - 1;
        s/$level/$nextlevel/g;
    }    
    return join '', @input;
}


# main program starts here

my @commented_out = (
);
my @iris = (
    "http://ab.גדהוזח.ij/kl/mn/op.html",
    "http://ab.גדה.וזח/ij/kl/mn/op.html",
    "http://אב.גד.הו/זח/טי/כל?מן=סע;פץ=קר#שת",
    "http://אב.גד.ef/gh/טי/כל.html",
    "http://ab.cd.הו/זח/ij/kl.html",
    "http://ab.גד.הו/זח/טי/kl.html",
    "http://ab.גדה123וזח.ij/kl/mn/op.html",
    "http://ab.cd.ef/זח1/2טי/כל.html",
    "http://ab.cd.ef/זח%31/%32טי/כל.html",
    "http://ab.גדהוזח.123/kl/mn/op.html",
#   "http://אב.גד.הו/זח/טי/כל?מן=סע;פץ=קר#שת",
#   "http://אב.גד.הו/זח/טי/כל?מן=סע;פץ=קר#שת",
#   "http://ab.cd.הו/זח/ij/kl?מן=op;פץ=st#שת",
#   "http://אב.גד.ef/gh/טי/כל?מן=סע;פץ=קר#שת",
#   "http://אב.cd.ef/זח/טי/כל?מן=סע;פץ=קר#שת",
#   "http://אב.גד.הו/gh/ij/כל?מן=סע;פץ=קר#שת",
#   "http://אב.גד.הו/זח/טי/kl?mn=סע;פץ=קר#שת",
#   "http://אב.גד.הו/זח/טי/kl?מן=op;פץ=קר#שת",
#   "http://אב.גד.הו/זח/טי/כל?mn=סע;qr=קר#שת",
#   "http://אב1.גד2.הו3/זח4/טי5/כל6?מן7=סע8;פץ9=קר0#שת1",
);

my @spare = (
    "אבגדהו",
    "אבג#דהו",
    "אבג:דהו",
    "אבג?דהו",
    "אבג/דהו",
    "אבג\@דהו",
    "אבג;דהו",
    "אבג.דהו",
    "אבג&דהו"
);

my @convert = (
    "אבגדהוזחטיכלמןסעפץקרשת",
    "abcdefghijklmnopqrstuv",
    "ABCDEFGHIJKLMNOPQRSTUV",
    "ابتثجحخدذرزسشصضطظعغفقكلمنهوي"
);

my $allHebrew = "אבגדהוזחטיךכלםמןנסעףפץצקרשת";

print <<"EOStart";
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang='en'>
  <head>
    <title>Examples of bidirectional IRIs</title>
  </head>
<body>
  <h1>Examples of bidirectional IRIs</h1>
  <p>Please view with a browser that does bidirectional rendering correctly!</p>
  <p>Please view with a browser that correctly uses nominal digit shapes!</p>
  <h2>Legend</h2>
  <p>LTR: left-to-right; RTL: right-to-left</p>
  <ul>
    <li style='color: black'>Logical Hebrew (displayed using LTR override)</li>
    <li style='color: green'>Visual Hebrew in LTR context</li>
    <li style='color: blue'>Visual Hebrew in calculated LTR context (displayed using LTR override;
              should match <span style='color: green'>green</span>)</li>
    <li style='color: green'>ASCII notation (upper case is Hebrew)</li>
    <li style='color: black'>ASCII notation (logical)</li>
    <li style='color: green'>ASCII notation (upper case is Arabic)</li>
    <li style='color: green'>Visual Arabic in LTR context</li>
    <li style='color: blue'>Visual Arabic in calculated LTR context (displayed using LTR override;
              should match <span style='color: green'>green</span>)</li>
    <li style='color: black'>Logical Arabic (displayed using LTR override)</li>
    <li style='color: red'>Visual Arabic in RTL context</li>
    <li style='color: red'>Visual Hebrew in RTL context</li>
    
  </ul>
EOStart

my $example = 0;
foreach my $iri (@iris) {
  $example++;
  my $iriVisual = log2vis ($iri, 'exact');
  my $iriASCII = $iri;
  $iriASCII =~ tr/אבגדהוזחטיכלמןסעפץקרשת/ABCDEFGHIJKLMNOPQRSTUV/;
  my $iriASCIIVisual = log2vis ($iriASCII, 'hebrew');
  my $iriArabic = $iri;
  $iriArabic =~ tr/אבגדהוזחטיכלמןסעפץקרשת/ابتثجحخدذرزسشصضطظعغفقك/;
  my $arabicVisual = log2vis ($iriArabic, 'exact');
  my $arabicASCIIVisual = log2vis ($iriASCII, 'arabic');
  
  print <<"EOItem";
  <h2>Example $example</h2>
  <ul style='font-family: monospace'>
    <li style='color: black'><bdo dir='ltr'>$iri</bdo></li>
    <li style='color: green' dir='ltr'>$iri</li>
    <li style='color: blue'><bdo dir='ltr'>$iriVisual</bdo></li>
    <li style='color: green'>$iriASCIIVisual</li>
    <li style='color: black'>$iriASCII</li>
    <li style='color: green'>$arabicASCIIVisual</li>
    <li style='color: green' dir='ltr'>$iriArabic</li>
    <li style='color: blue'><bdo dir='ltr'>$arabicVisual</bdo></li>
    <li style='color: black'><bdo dir='ltr'>$iriArabic</bdo></li>
    <li style='color: red'><span dir='rtl'>$iriArabic</span></li>
    <li style='color: red'><span dir='rtl'>$iri</span></li>
  </ul>
EOItem
}

print <<"EOEnd";
<hr />
  <div class="smallprint">
    <p>Version: \$Id\$</p>
    <p>Questions? <a href="mailto:duerst\@w3.org">duerst\@w3.org</a></p>
    <p><a href="http://www.w3.org/Consortium/Legal/ipr-notice-20000612.html#Copyright">Copyright</a> &nbsp; © 1997 - 2002
       <a href="http://www.w3.org/">W3C</a> ( <a href="http://www.lcs.mit.edu/">MIT</a> , <a href="http://www.inria.fr/">INRIA</a> ,
       <a href="http://www.keio.ac.jp/">Keio</a> ), All Rights Reserved. W3C
       <a href="http://www.w3.org/Consortium/Legal/ipr-notice-20000612.html#Legal_Disclaimer">liability,</a>
       <a href="http://www.w3.org/Consortium/Legal/ipr-notice-20000612.html#W3C_Trademarks">trademark</a> ,
       <a href="http://www.w3.org/Consortium/Legal/copyright-documents-19990405.html">document use</a> and
       <a href="http://www.w3.org/Consortium/Legal/copyright-software-19980720.html">software licensing</a> rules apply. Your interactions with this site
       are in accordance with our <a href="http://www.w3.org/Consortium/Legal/privacy-statement-20000612.html#Public">public</a> and
       <a href="http://www.w3.org/Consortium/Legal/privacy-statement-20000612.html#Members">Member</a> privacy statements.</p>
    </div>
  </body>
</html>
EOEnd

# end of program