#!/usr/bin/perl use strict; use utf8; my $rules = [ [[ 0x00, 0x7f], [0x00], [0x7f]], [[ 0x80, 0x7ff], [0xc2, 0x80], [0xdf, 0xbf]], [[ 0x800, 0xfff], [0xe0, 0xa0, 0x80], [0xe0, 0xbf, 0xbf]], [[ 0x1000, 0xcfff], [0xe1, 0x80, 0x80], [0xec, 0xbf, 0xbf]], [[ 0x1000, 0xcfff], [0xe1, 0x80, 0x80], [0xec, 0xbf, 0xbf]], [[ 0xd000, 0xd7ff], [0xed, 0x80, 0x80], [0xed, 0x9f, 0xbf]], [[ 0xe000, 0xffff], [0xee, 0x80, 0x80], [0xef, 0xbf, 0xbf]], [[ 0x10000, 0x3ffff], [0xf0, 0x90, 0x80, 0x80], [0xf0, 0xbf, 0xbf, 0xbf]], [[ 0x40000, 0xfffff], [0xf1, 0x80, 0x80, 0x80], [0xf3, 0xbf, 0xbf, 0xbf]], [[0x100000, 0x10ffff], [0xf4, 0x80, 0x80, 0x80], [0xf4, 0xbf, 0xbf, 0xbf]], ]; # Some tests. print &compileUTF8range("\x{71}", "\x{9fa5}"), "\n"; print &compileUTF8range("\x{4e01}", "\x{9fa5}"), "\n"; print &compileUTF8range("\x{40001}", "\x{10ffff}"), "\n"; print &compileUTF8range("\x{10ffdf}", "\x{10ffe2}"), "\n"; print &compileUTF8range("\x{30ca}", "\x{3105}"), "\n"; sub compileUTF8range { my ($bot, $top) = @_; my $botRuleNo = &find($bot); my $topRuleNo = &find($top); my $ret; if ($botRuleNo == $topRuleNo) { $ret = &expandRange($botRuleNo, $bot, $top); } else { my $bRule = $rules->[$botRuleNo]; my $tRule = $rules->[$topRuleNo]; my @ranges; my $t; # Expand the bottom rule (including stair-steps) (undef, undef, $t) = &expandRange($botRuleNo, $bot, chr($bRule->[0][1])); push (@ranges, $t); # Expland each rule in between the bottom and top. for (my $i = $botRuleNo+1; $i < $topRuleNo; $i++) { (undef, undef, $t) = &expandRange($i, chr($rules->[$i][0][0]), chr($rules->[$i][0][1])); push (@ranges, $t); } # Expand the top rule. (undef, undef, $t) = &expandRange($topRuleNo, chr($tRule->[0][0]), $top); push (@ranges, $t); $ret = join('|', @ranges);; } return $ret; } # Find which rule subsumes the codepoint. sub find { my ($point) = @_; for (my $i = 0; $i < @$rules; $i++) { if (ord($point) >= $rules->[$i][0][0] && ord($point) <= $rules->[$i][0][1]) { return $i; } } die; } # Expand the characters between $bot and $top. ruleNo must subsume both characters. sub expandRange { my ($ruleNo, $bot, $top) = @_; &utf8::encode($bot); &utf8::encode($top); my $botBytes = [unpack('C*', $bot)]; my $topBytes = [unpack('C*', $top)]; my $bytes = scalar @$botBytes; my $rule = $rules->[$ruleNo]; return &walk([@{$rule->[1]}], [@{$rule->[2]}], $botBytes, $topBytes); } sub walk { my ($bRules, $tRules, $botBytes, $topBytes) = @_; my $bRule = shift (@$bRules); my $tRule = shift (@$tRules); my $botByte = shift (@$botBytes); my $topByte = shift (@$topBytes); my $b = &char($botByte); my $t = &char($topByte); my $rb = &char($bRule); my $rt = &char($tRule); if (!@$bRules) { return ($botByte != $bRule, $topByte != $tRule, $botByte == $topByte ? "$b" : "[$b-$t]"); } # Do a tentative walk of remaining chars. Sets $splitB and $splitT. my ($splitB, $splitT, $range) = &walk([@$bRules], [@$tRules], [@$botBytes], [@$topBytes]); if ($botByte == $topByte) { return ($splitB, $splitT, "($b, $range)"); } my @ret; my $topStr = ''; if ($splitB) { # Walk the bottom as if the top char is the top rule. my (undef, undef, $bRange) = &walk([@$bRules], [@$tRules], [@$botBytes], [@$tRules]); push (@ret,"($b, $bRange)"); $botByte++; $b = &char($botByte); } elsif ($botByte > $bRule) { $splitB = 1; # our callers have to worry about stair-stepping } if ($splitT) { # Walk the bottom as if the bottom char is the bottom rule. my (undef, undef, $tRange) = &walk([@$bRules], [@$tRules], [@$bRules], [@$topBytes]); $topStr = "($t, $tRange)"; $topByte--; $t = &char($topByte); } elsif ($topByte > $tRule) { $splitT = 1; # our callers have to worry about stair-stepping } # Walk intermediate sub-rules. if ($topByte > $botByte) { my $r1 = &walkFull([@$bRules], [@$tRules]); push (@ret, "([$b-$t], $r1)"); } if ($topStr) { push (@ret, $topStr); } return ($splitB, $splitT, join("|", @ret)); } sub char { my ($ch) = @_; my $hex = sprintf ("%02X", $ch); return "\\u$hex"; } # Include all chars between $bot and $top . sub walkFull { my ($bot, $top) = @_; my $b = &char(shift @$bot); my $t = &char(shift @$top); my $ret = $b eq $t ? $b : "[$b-$t]"; if (@$bot) { my $r = &walkFull($bot, $top); return "$ret, $r"; } else { return $ret; } }