#!/usr/bin/perl
use strict;
use utf8;

my $rules = 
    [
     [[    0x00,     0x7f], [0x00],  
			    [0x7f]], 
     [[    0x80,    0x7ff], [0xc2, 0x80], 
			    [0xdf, 0xbf]], 
     [[   0x800,    0xfff], [0xe0, 0xa0, 0x80], 
			    [0xe0, 0xbf, 0xbf]], 
     [[  0x1000,   0xcfff], [0xe1, 0x80, 0x80],  
			    [0xec, 0xbf, 0xbf]], 
     [[  0x1000,   0xcfff], [0xe1, 0x80, 0x80],  
			    [0xec, 0xbf, 0xbf]], 
     [[  0xd000,   0xd7ff], [0xed, 0x80, 0x80],  
			    [0xed, 0x9f, 0xbf]], 
     [[  0xe000,   0xffff], [0xee, 0x80, 0x80],  
			    [0xef, 0xbf, 0xbf]], 
     [[ 0x10000,  0x3ffff], [0xf0, 0x90, 0x80, 0x80],  
			    [0xf0, 0xbf, 0xbf, 0xbf]], 
     [[ 0x40000,  0xfffff], [0xf1, 0x80, 0x80, 0x80],  
			    [0xf3, 0xbf, 0xbf, 0xbf]], 
     [[0x100000, 0x10ffff], [0xf4, 0x80, 0x80, 0x80],  
			    [0xf4, 0xbf, 0xbf, 0xbf]], 
     ];

# Some tests.
print &compileUTF8range("\x{71}", "\x{9fa5}"), "\n";
print &compileUTF8range("\x{4e01}", "\x{9fa5}"), "\n";
print &compileUTF8range("\x{40001}", "\x{10ffff}"), "\n";
print &compileUTF8range("\x{10ffdf}", "\x{10ffe2}"), "\n";
print &compileUTF8range("\x{30ca}", "\x{3105}"), "\n";

sub compileUTF8range {
    my ($bot, $top) = @_;
    my $botRuleNo = &find($bot);
    my $topRuleNo = &find($top);
    my $ret;
    if ($botRuleNo == $topRuleNo) {
	$ret = &expandRange($botRuleNo, $bot, $top);
    } else {
	my $bRule = $rules->[$botRuleNo];
	my $tRule = $rules->[$topRuleNo];

	my @ranges;
	my $t;

	# Expand the bottom rule (including stair-steps)
	(undef, undef, $t) = &expandRange($botRuleNo, $bot, chr($bRule->[0][1]));
	push (@ranges, $t);

	# Expland each rule in between the bottom and top.
	for (my $i = $botRuleNo+1; $i < $topRuleNo; $i++) {
	    (undef, undef, $t) = &expandRange($i, chr($rules->[$i][0][0]), chr($rules->[$i][0][1]));
	    push (@ranges, $t);
	}

	# Expand the top rule.
	(undef, undef, $t) = &expandRange($topRuleNo, chr($tRule->[0][0]), $top);
	push (@ranges, $t);
	$ret = join('|', @ranges);;
    }
    return $ret;
}

# Find which rule subsumes the codepoint.
sub find {
    my ($point) = @_;
    for (my $i = 0; $i < @$rules; $i++) {
	if (ord($point) >= $rules->[$i][0][0] && 
	    ord($point) <= $rules->[$i][0][1]) {
	    return $i;
	}
    }
    die;
}

# Expand the characters between $bot and $top. ruleNo must subsume both characters.
sub expandRange {
    my ($ruleNo, $bot, $top) = @_;
    &utf8::encode($bot);
    &utf8::encode($top);
    my $botBytes = [unpack('C*', $bot)];
    my $topBytes = [unpack('C*', $top)];
    my $bytes = scalar @$botBytes;
    my $rule = $rules->[$ruleNo];
    return &walk([@{$rule->[1]}], [@{$rule->[2]}], $botBytes, $topBytes);
}

sub walk {
    my ($bRules, $tRules, $botBytes, $topBytes) = @_;
    my $bRule = shift (@$bRules);
    my $tRule = shift (@$tRules);
    my $botByte = shift (@$botBytes);
    my $topByte = shift (@$topBytes);
    my $b = &char($botByte);
    my $t = &char($topByte);
    my $rb = &char($bRule);
    my $rt = &char($tRule);

    if (!@$bRules) {
	return ($botByte != $bRule, $topByte != $tRule, $botByte == $topByte ? "$b" : "[$b-$t]");
    }

    # Do a tentative walk of remaining chars. Sets $splitB and $splitT.
    my ($splitB, $splitT, $range) = &walk([@$bRules], [@$tRules], [@$botBytes], [@$topBytes]);

    if ($botByte == $topByte) {
	return ($splitB, $splitT, "($b, $range)");
    }

    my @ret;
    my $topStr = '';

    if ($splitB) {
	# Walk the bottom as if the top char is the top rule.
	my (undef, undef, $bRange) = &walk([@$bRules], [@$tRules], [@$botBytes], [@$tRules]);
	push (@ret,"($b, $bRange)");
	$botByte++;
	$b = &char($botByte);
    } elsif ($botByte > $bRule) {
	$splitB = 1; # our callers have to worry about stair-stepping
    }

    if ($splitT) {
	# Walk the bottom as if the bottom char is the bottom rule.
	my (undef, undef, $tRange) = &walk([@$bRules], [@$tRules], [@$bRules], [@$topBytes]);
	$topStr = "($t, $tRange)";
	$topByte--;
	$t = &char($topByte);
    } elsif ($topByte > $tRule) {
	$splitT = 1; # our callers have to worry about stair-stepping
    }

    # Walk intermediate sub-rules.
    if ($topByte > $botByte) {
	my $r1 = &walkFull([@$bRules], [@$tRules]);
	push (@ret, "([$b-$t], $r1)");
    }
    if ($topStr) {
	push (@ret, $topStr);
    }
    return ($splitB, $splitT, join("|", @ret));
}

sub char {
    my ($ch) = @_;
    my $hex = sprintf ("%02X", $ch);
    return "\\u$hex";
}

# Include all chars between $bot and $top .
sub walkFull {
    my ($bot, $top) = @_;
    my $b = &char(shift @$bot);
    my $t = &char(shift @$top);
    my $ret = $b eq $t ? $b : "[$b-$t]";
    if (@$bot) {
	my $r = &walkFull($bot, $top);
	return "$ret, $r";
    } else {
	return $ret;
    }
}

